@inproceedings{ef7f635b5aee4533bf78f77e36e83f94,
  title     = {{Dataset Culling}: Towards Efficient Training of Distillation-Based Domain Specific Models},
  abstract  = {Real-time CNN-based object detection models for applications like surveillance can achieve high accuracy but are computationally expensive. Recent works have shown 10 to 100$\times$ reduction in computation cost for inference by using domain-specific networks. However, prior works have focused on inference only. If the domain model requires frequent retraining, training costs can pose a significant bottleneck. To address this, we propose Dataset Culling: a pipeline to reduce the size of the dataset for training, based on the prediction difficulty. Images that are easy to classify are filtered out since they contribute little to improving the accuracy. The difficulty is measured using our proposed confidence loss metric with little computational overhead. Dataset Culling is extended to optimize the image resolution to further improve training and inference costs. We develop fixed-angle, long-duration video datasets across several domains, and we show that the dataset size can be culled by a factor of 300$\times$ to reduce the total training time by 47$\times$ with no accuracy loss or even with slight improvement.},
  keywords  = {Dataset Culling, Deep Learning, Distillation, Object Detection, Training Efficiency},
  author    = {Yoshioka, Kentaro and Lee, Edward and Wong, Simon and Horowitz, Mark},
  note      = {Publisher Copyright: {\textcopyright} 2019 IEEE.; 26th IEEE International Conference on Image Processing, ICIP 2019 ; Conference date: 22-09-2019 Through 25-09-2019},
  year      = {2019},
  month     = sep,
  doi       = {10.1109/ICIP.2019.8803462},
  language  = {English},
  series    = {Proceedings - International Conference on Image Processing, ICIP},
  publisher = {IEEE Computer Society},
  pages     = {3237--3241},
  booktitle = {2019 {IEEE} International Conference on Image Processing, {ICIP} 2019 - Proceedings},
}