package cc.mallet.util;

import cc.mallet.pipe.CharSequenceLowercase;
import cc.mallet.pipe.FeatureCountPipe;
import cc.mallet.pipe.FeatureDocFreqPipe;
import cc.mallet.pipe.SerialPipes;
import cc.mallet.pipe.SimpleTokenizer;
import cc.mallet.pipe.StringList2FeatureSequence;
import cc.mallet.pipe.iterator.CsvIterator;
import cc.mallet.types.Alphabet;
import cc.mallet.types.Instance;
import cc.mallet.types.InstanceList;
import cc.mallet.util.CommandOption;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import org.eclipse.wst.jsdt.internal.oaametadata.IOAAMetaDataConstants;

/* loaded from: input_file:cc/mallet/util/BulkLoader.class */
/**
 * Command-line tool for importing large amounts of text into Mallet format.
 *
 * <p>The input file holds one instance per line. Each line is split by {@code --line-regex}
 * into name, label and data groups, tokenized, and added to an {@link InstanceList} that is
 * then saved to the output file. When count-based or document-frequency-based pruning is
 * requested, a preliminary pass over the input collects the pruned words into the tokenizer's
 * stoplist before the instance list is written.
 */
public class BulkLoader {
    /** During the stoplist pass, the running instance count is printed every this many instances. */
    private static final int PROGRESS_INTERVAL = 100000;

    static CommandOption.File inputFile = new CommandOption.File(BulkLoader.class, "input", "FILE", true, null, "The file containing data, one instance per line", null);
    static CommandOption.File outputFile = new CommandOption.File(BulkLoader.class, "output", "FILE", true, new File("mallet.data"), "Write the instance list to this file", null);
    static CommandOption.Boolean preserveCase = new CommandOption.Boolean(BulkLoader.class, "preserve-case", "[TRUE|FALSE]", false, false, "If true, do not force all strings to lowercase.", null);
    static CommandOption.Boolean removeStopWords = new CommandOption.Boolean(BulkLoader.class, "remove-stopwords", "[TRUE|FALSE]", false, false, "If true, remove common \"stop words\" from the text.\nThis option invokes a minimal English stoplist. ", null);
    static CommandOption.File stoplistFile = new CommandOption.File(BulkLoader.class, "stoplist", "FILE", true, null, "Read newline-separated words from this file,\n   and remove them from text. This option overrides\n   the default English stoplist triggered by --remove-stopwords.", null);
    // NOTE(review): this option is declared but never read by any method in this class.
    static CommandOption.Boolean keepSequence = new CommandOption.Boolean(BulkLoader.class, "keep-sequence", "[TRUE|FALSE]", false, false, "If true, final data will be a FeatureSequence rather than a FeatureVector.", null);
    static CommandOption.String lineRegex = new CommandOption.String(BulkLoader.class, "line-regex", "REGEX", true, "^([^\\t]*)\\t([^\\t]*)\\t(.*)", "Regular expression containing regex-groups for label, name and data.", null);
    static CommandOption.Integer nameGroup = new CommandOption.Integer(BulkLoader.class, "name", "INTEGER", true, 1, "The index of the group containing the instance name.\n   Use 0 to indicate that this field is not used.", null);
    // The option name is spelled out as a literal instead of borrowing an unrelated Eclipse JSDT
    // constant (a decompiler artifact). NOTE(review): assumes that constant's value is "label".
    static CommandOption.Integer labelGroup = new CommandOption.Integer(BulkLoader.class, "label", "INTEGER", true, 2, "The index of the group containing the label string.\n   Use 0 to indicate that this field is not used.", null);
    static CommandOption.Integer dataGroup = new CommandOption.Integer(BulkLoader.class, "data", "INTEGER", true, 3, "The index of the group containing the data.", null);
    static CommandOption.Integer pruneCount = new CommandOption.Integer(BulkLoader.class, "prune-count", "N", false, 0, "Reduce features to those that occur more than N times.", null);
    static CommandOption.Double docProportionCutoff = new CommandOption.Double(BulkLoader.class, "prune-doc-frequency", "N", false, 1.0d, "Remove features that occur in more than (X*100)% of documents. 0.05 is equivalent to IDF of 3.0.", null);

    /**
     * Makes a preliminary pass over the input file to gather feature statistics, then asks the
     * counting pipes to add the pruned words to the stoplist of {@code simpleTokenizer}.
     *
     * <p>The pass itself runs on a deep clone of the tokenizer; only the final
     * {@code addPrunedWordsToStoplist} calls receive the caller's tokenizer. A count pipe is
     * included only when {@code --prune-count} is greater than 0, and a document-frequency pipe
     * only when {@code --prune-doc-frequency} is less than 1.0.
     *
     * @param simpleTokenizer tokenizer that receives the pruned words
     * @throws IOException if the input file cannot be opened or closed
     */
    public static void generateStoplist(SimpleTokenizer simpleTokenizer) throws IOException {
        boolean pruneByCount = pruneCount.value > 0;
        boolean pruneByDocFrequency = docProportionCutoff.value < 1.0d;

        // Both counting pipes share one alphabet with the feature-sequence pipe.
        Alphabet alphabet = new Alphabet();
        FeatureCountPipe featureCountPipe = new FeatureCountPipe(alphabet, null);
        FeatureDocFreqPipe featureDocFreqPipe = new FeatureDocFreqPipe(alphabet, null);

        // Left as a raw list, as before: parameterizing it needs the pipe base type, which this
        // file does not import.
        ArrayList pipes = new ArrayList();
        if (!preserveCase.value) {
            pipes.add(new CharSequenceLowercase());
        }
        pipes.add(simpleTokenizer.deepClone());
        pipes.add(new StringList2FeatureSequence(alphabet));
        if (pruneByCount) {
            pipes.add(featureCountPipe);
        }
        if (pruneByDocFrequency) {
            pipes.add(featureDocFreqPipe);
        }

        // try-with-resources closes the input reader even if the pipeline throws; previously the
        // reader was never closed.
        try (FileReader reader = new FileReader(inputFile.value)) {
            CsvIterator csvIterator = new CsvIterator(reader, lineRegex.value, dataGroup.value, labelGroup.value, nameGroup.value);
            Iterator<Instance> instances = new SerialPipes(pipes).newIteratorFrom(csvIterator);
            int processed = 0;
            while (instances.hasNext()) {
                processed++;
                if (processed % PROGRESS_INTERVAL == 0) {
                    System.out.println(processed);
                }
                // The instance itself is discarded; pulling it through the pipes is what matters.
                instances.next();
            }
        }

        if (pruneByCount) {
            featureCountPipe.addPrunedWordsToStoplist(simpleTokenizer, pruneCount.value);
        }
        if (pruneByDocFrequency) {
            featureDocFreqPipe.addPrunedWordsToStoplist(simpleTokenizer, docProportionCutoff.value);
        }
    }

    /**
     * Reads the input file through a lowercase (unless {@code --preserve-case}) / tokenize /
     * feature-sequence pipeline and saves the resulting {@link InstanceList} to the output file.
     *
     * @param simpleTokenizer tokenizer used directly in the pipeline (not cloned)
     * @throws IOException if the input file cannot be opened or closed
     */
    public static void writeInstanceList(SimpleTokenizer simpleTokenizer) throws IOException {
        // Raw list for the same reason as in generateStoplist.
        ArrayList pipes = new ArrayList();
        if (!preserveCase.value) {
            pipes.add(new CharSequenceLowercase());
        }
        pipes.add(simpleTokenizer);
        pipes.add(new StringList2FeatureSequence(new Alphabet()));

        InstanceList instanceList = new InstanceList(new SerialPipes(pipes));
        // Close the input reader once every line has been added; previously it was leaked.
        try (FileReader reader = new FileReader(inputFile.value)) {
            instanceList.addThruPipe(new CsvIterator(reader, lineRegex.value, dataGroup.value, labelGroup.value, nameGroup.value));
        }
        instanceList.save(outputFile.value);
    }

    /**
     * Entry point: parses the command-line options, builds the tokenizer, optionally generates
     * a pruning stoplist, and writes the instance list.
     *
     * @param strArr command-line arguments
     * @throws IOException if reading the input file fails
     */
    public static void main(String[] strArr) throws IOException {
        CommandOption.setSummary(BulkLoader.class, "Efficient tool for importing large amounts of text into Mallet format");
        CommandOption.process(BulkLoader.class, strArr);

        // An explicit stoplist file takes precedence over --remove-stopwords.
        SimpleTokenizer simpleTokenizer;
        if (stoplistFile.value != null) {
            simpleTokenizer = new SimpleTokenizer(stoplistFile.value);
        } else if (removeStopWords.value) {
            // presumably 1 selects the built-in English stoplist — TODO confirm against SimpleTokenizer
            simpleTokenizer = new SimpleTokenizer(1);
        } else {
            // presumably 0 selects an empty stoplist — TODO confirm against SimpleTokenizer
            simpleTokenizer = new SimpleTokenizer(0);
        }

        // The extra pass over the input is only needed when some form of pruning was requested.
        if (pruneCount.value > 0 || docProportionCutoff.value < 1.0d) {
            generateStoplist(simpleTokenizer);
        }
        writeInstanceList(simpleTokenizer);
    }
}
