package codemining.lm.util;

import codemining.languagetools.ITokenizer;
import codemining.util.parallel.ParallelThreadPool;
import com.google.common.collect.ConcurrentHashMultiset;
import com.google.common.collect.Multiset;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
import java.util.Stack;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang.exception.ExceptionUtils;

/* JADX WARN: Classes with same name are omitted:
  input_file:lib/naturalize.jar:codemining/lm/util/VocabularyBuildingUtility.class
 */
/* loaded from: input_file:naturalize.jar:codemining/lm/util/VocabularyBuildingUtility.class */
/**
 * Static helper that builds a token vocabulary from a set of source files.
 *
 * <p>Each file is tokenized in its own task on a {@link ParallelThreadPool};
 * all tasks accumulate token counts into one shared multiset, from which
 * infrequent tokens are then pruned.
 */
public final class VocabularyBuildingUtility {
    private static final Logger LOGGER = Logger.getLogger(VocabularyBuildingUtility.class.getName());

    /**
     * Task that tokenizes a single file and adds every produced token to a
     * shared multiset.
     */
    private static final class VocabularyExtractorRunnable implements Runnable {
        /** File whose contents are tokenized. */
        private final File codeFile;

        /** Shared token-count accumulator; written to by every task. */
        private final Multiset<String> vocabularySet;

        /** Tokenizer used to split the file contents into tokens. */
        private final ITokenizer tokenizer;

        /**
         * @param file       the file to tokenize
         * @param multiset   the multiset the tokens are added to
         * @param iTokenizer the tokenizer to apply to the file contents
         */
        public VocabularyExtractorRunnable(File file, Multiset<String> multiset, ITokenizer iTokenizer) {
            this.codeFile = file;
            this.vocabularySet = multiset;
            this.tokenizer = iTokenizer;
        }

        /**
         * Reads the file, tokenizes it and adds all tokens to the shared
         * multiset. If the file cannot be read, the stack trace is logged at
         * WARNING level and the file contributes no tokens.
         */
        @Override
        public void run() {
            LOGGER.finer("Reading file " + codeFile.getAbsolutePath());
            try {
                // NOTE: this overload of readFileToString decodes with the
                // platform default charset.
                final char[] code = FileUtils.readFileToString(codeFile).toCharArray();
                vocabularySet.addAll(tokenizer.tokenListFromCode(code));
            } catch (IOException e) {
                LOGGER.warning(ExceptionUtils.getFullStackTrace(e));
            }
        }
    }

    /**
     * Builds the vocabulary of the given files.
     *
     * <p>One extraction task per file is pushed to a fresh
     * {@link ParallelThreadPool}; after {@code waitForTermination()} returns
     * (presumably once all tasks have finished), every token whose total
     * count is less than or equal to {@code i} is removed.
     *
     * @param collection the files to tokenize
     * @param iTokenizer the tokenizer applied to each file
     * @param i          the pruning threshold: only tokens that occur strictly
     *                   more than {@code i} times are kept
     * @return the distinct surviving tokens; this is the element-set view of
     *         the internal multiset, not a copy
     */
    public static Set<String> buildVocabulary(Collection<File> collection, ITokenizer iTokenizer, int i) {
        final ConcurrentHashMultiset<String> tokenCounts = ConcurrentHashMultiset.create();

        final ParallelThreadPool pool = new ParallelThreadPool();
        for (final File codeFile : collection) {
            pool.pushTask(new VocabularyExtractorRunnable(codeFile, tokenCounts, iTokenizer));
        }
        pool.waitForTermination();

        // Collect the infrequent entries first and remove them in a second
        // pass, so the multiset is not modified while it is being iterated.
        final List<Multiset.Entry<String>> rareEntries = new ArrayList<Multiset.Entry<String>>();
        for (final Multiset.Entry<String> entry : tokenCounts.entrySet()) {
            if (entry.getCount() <= i) {
                rareEntries.add(entry);
            }
        }
        for (final Multiset.Entry<String> rare : rareEntries) {
            // Removing all occurrences drops the token from the element set.
            tokenCounts.remove(rare.getElement(), rare.getCount());
        }

        final Set<String> vocabulary = tokenCounts.elementSet();
        LOGGER.info("Vocabulary built, with " + vocabulary.size() + " words");
        return vocabulary;
    }

    /** Not instantiable: this class only exposes static helpers. */
    private VocabularyBuildingUtility() {
    }
}
