% Journal article. `volume` and `number` hold bare integers and `pages` uses a
% double-hyphen range so that the bibliography style supplies its own
% "vol."/"no."/"pp." labels; {Gaussian} is braced to survive sentence-casing.
@article{Fan_Chengkai_Preprocessing_2022,
  author       = {Fan, Chengkai and Zhang, Na and Jiang, Bei and Liu, Wei Victor},
  title        = {Preprocessing Large Datasets Using {Gaussian} Mixture Modelling to Improve Prediction Accuracy of Truck Productivity at Mine Sites},
  journal      = {Archives of Mining Sciences},
  volume       = {67},
  number       = {4},
  pages        = {661--680},
  year         = {2022},
  publisher    = {Committee of Mining PAS},
  type         = {Article},
  howpublished = {online},
  doi          = {10.24425/ams.2022.143680},
  url          = {http://journals.pan.pl/Content/125534/PDF-MASTER/Archiwum-67-4-05-Bei-Jiang.pdf},
  keywords     = {Oil sands mining, Mine truck productivity, Gaussian mixture model, Latent variable, Prediction accuracy, Relative importance},
  abstract     = {The historical datasets at operating mine sites are usually large. Directly applying large datasets to build prediction models may lead to inaccurate results. To overcome the real-world challenges, this study aimed to handle these large datasets using Gaussian mixture modelling (GMM) for developing a novel and accurate prediction model of truck productivity. A large dataset of truck haulage collected at operating mine sites was clustered by GMM into three latent classes before the prediction model was built. The labels of these latent classes generated a latent variable. Two multiple linear regression (MLR) models were then constructed, including the ordinary-MLR (O-MLR) and the hybrid GMM-MLR models. The GMM-MLR model incorporated the observed input variables and a latent variable in the form of interaction terms. The O-MLR model was the baseline model and did not involve the latent variable. The GMM-MLR model performed considerably better than the O-MLR model in predicting truck productivity. The interaction terms quantitatively measured the differences in how the observed input variables affected truck productivity in three classes (high, medium, and low truck productivity). The haul distance was the most crucial input variable in the GMM-MLR model. This study provides new insights into handling massive amounts of data in truck haulage datasets and a more accurate prediction model for truck productivity.},
}