package codemining.lm.sequencememoizer;

import codemining.languagetools.ITokenizer;
import codemining.lm.ILanguageModel;
import codemining.lm.ITokenGeneratingLanguageModel;
import codemining.lm.util.TokenVocabularyBuilder;
import codemining.lm.util.VocabularyToInt;
import codemining.util.SettingsLoader;
import codemining.util.serialization.ISerializationStrategy;
import codemining.util.serialization.Serializer;
import com.esotericsoftware.kryo.DefaultSerializer;
import com.esotericsoftware.kryo.serializers.JavaSerializer;
import com.google.common.collect.BiMap;
import com.google.common.collect.Lists;
import edu.columbia.stat.wood.pub.sequencememoizer.IntSequenceMemoizer;
import edu.columbia.stat.wood.pub.sequencememoizer.IntSequenceMemoizerParameters;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.List;
import java.util.Set;
import java.util.logging.Logger;
import org.apache.commons.io.filefilter.AbstractFileFilter;
import org.apache.commons.lang.exception.ExceptionUtils;
import org.apache.commons.lang.math.RandomUtils;

/**
 * Language model that delegates to an {@link IntSequenceMemoizer} operating on
 * integer-encoded token sequences.
 *
 * <p>Tokens are mapped to integers through a {@link VocabularyToInt} that is
 * built in {@link #trainModel(Collection)}. Both the memoizer and the
 * vocabulary mapper are unset until {@code trainModel} has been called, so the
 * query methods must only be used on a trained instance.
 */
@DefaultSerializer(JavaSerializer.class)
public class SequenceMemoizerLM implements ITokenGeneratingLanguageModel<ITokenizer.FullToken> {
    private static final long serialVersionUID = 8676708745889523608L;

    private static final Logger LOGGER = Logger.getLogger(SequenceMemoizerLM.class.getName());

    /** Value of the "CleanVocabularyCountThreshold" setting, passed to the vocabulary builder. */
    public static final int CLEAN_VOCABULARY_THRESHOLD = (int) SettingsLoader.getNumericSetting("CleanVocabularyCountThreshold", 20.0d);

    /** Value of the "iterations" setting; passed to {@code IntSequenceMemoizer.sample} after training. */
    private static final int NUM_ITERATIONS = (int) SettingsLoader.getNumericSetting("iterations", 100.0d);

    /** Value of the "sample" setting; a file is used for training when a random double is at most this value. */
    private static final double SAMPLE_PERCENTAGE = SettingsLoader.getNumericSetting("sample", 1.0d);

    /** Underlying sequence memoizer; assigned in {@link #trainModel(Collection)}. */
    private IntSequenceMemoizer memoizer;

    private final ITokenizer tokenizer;

    /** Token-to-integer mapping; assigned in {@link #trainModel(Collection)}. */
    VocabularyToInt vocabularyMapper;

    /**
     * Creates an untrained model.
     *
     * @param iTokenizer the tokenizer used to build the vocabulary and exposed via {@link #getTokenizer()}
     */
    public SequenceMemoizerLM(ITokenizer iTokenizer) {
        this.tokenizer = iTokenizer;
    }

    /**
     * Generates a token sequence from the memoizer, seeded with the sentence-start token.
     *
     * <p>The returned list always begins with {@link ITokenizer#SENTENCE_START}; generated
     * tokens are appended up to and including the first {@link ITokenizer#SENTENCE_END}.
     *
     * @return the generated tokens, each wrapped in a {@link ITokenizer.FullToken} with an empty second argument
     */
    @Override // codemining.lm.ITokenGeneratingLanguageModel
    public List<ITokenizer.FullToken> generateSentence() {
        List<ITokenizer.FullToken> sentence = Lists.newArrayList();
        int[] context = {this.vocabularyMapper.getAlphabet().get(ITokenizer.SENTENCE_START).intValue()};
        sentence.add(new ITokenizer.FullToken(ITokenizer.SENTENCE_START, ""));
        // NOTE(review): 10000 is presumably the maximum number of symbols to generate - confirm
        // against the IntSequenceMemoizer API.
        int[] generated = this.memoizer.generateSequence(context, 10000);
        BiMap<Integer, String> idToToken = this.vocabularyMapper.getAlphabet().inverse();
        for (int id : generated) {
            String token = idToToken.get(Integer.valueOf(id));
            sentence.add(new ITokenizer.FullToken(token, ""));
            if (token.equals(ITokenizer.SENTENCE_END)) {
                break;
            }
        }
        return sentence;
    }

    /**
     * Scores the tokens of a file against the model using an empty context.
     *
     * @param file the file to score
     * @return the value of {@code IntSequenceMemoizer.sequenceProbability} for the file's int sequence
     *     (NOTE(review): presumably a log-probability - confirm against the memoizer library)
     * @throws IOException declared by the interface; presumably raised when the file cannot be read
     */
    @Override // codemining.lm.ILanguageModel
    public double getAbsoluteEntropy(File file) throws IOException {
        return this.memoizer.sequenceProbability(new int[0], this.vocabularyMapper.fileToIntSequence(file));
    }

    /**
     * Scores the tokens of a code snippet against the model using an empty context.
     *
     * @param str the code to score
     * @return the value of {@code IntSequenceMemoizer.sequenceProbability} for the snippet's int sequence
     */
    @Override // codemining.lm.ILanguageModel
    public double getAbsoluteEntropy(String str) {
        return this.memoizer.sequenceProbability(new int[0], this.vocabularyMapper.codeToIntSequence(str));
    }

    /**
     * Scores a file as in {@link #getAbsoluteEntropy(File)} and divides by the number of
     * encoded tokens.
     *
     * <p>An empty token sequence yields a non-finite result (floating-point division by zero)
     * rather than an exception.
     *
     * @param file the file to score
     * @return the per-token score
     * @throws IOException declared by the interface; presumably raised when the file cannot be read
     */
    @Override // codemining.lm.ILanguageModel
    public double getExtrinsticEntropy(File file) throws IOException {
        // Keep the sequence in a local so its length can be used as the divisor.
        int[] sequence = this.vocabularyMapper.fileToIntSequence(file);
        return this.memoizer.sequenceProbability(new int[0], sequence) / sequence.length;
    }

    /**
     * Scores a code snippet as in {@link #getAbsoluteEntropy(String)} and divides by the
     * number of encoded tokens.
     *
     * <p>An empty token sequence yields a non-finite result (floating-point division by zero)
     * rather than an exception.
     *
     * @param str the code to score
     * @return the per-token score
     */
    @Override // codemining.lm.ILanguageModel
    public double getExtrinsticEntropy(String str) {
        // Keep the sequence in a local so its length can be used as the divisor.
        int[] sequence = this.vocabularyMapper.codeToIntSequence(str);
        return this.memoizer.sequenceProbability(new int[0], sequence) / sequence.length;
    }

    /**
     * Returns this same instance; no copy is made.
     *
     * @return {@code this}
     */
    @Override // codemining.lm.ILanguageModel
    public ILanguageModel getImmutableVersion() {
        return this;
    }

    /**
     * @return the tokenizer supplied at construction
     */
    @Override // codemining.lm.ITokenGeneratingLanguageModel
    public ITokenizer getTokenizer() {
        return this.tokenizer;
    }

    /**
     * @return the file filter provided by the tokenizer
     */
    @Override // codemining.lm.ILanguageModel
    public AbstractFileFilter modelledFilesFilter() {
        return this.tokenizer.getFileFilter();
    }

    /**
     * Serializes this model using the serializer returned by {@link Serializer#getSerializer()}.
     *
     * @param str the target passed to the serializer (NOTE(review): presumably a file path - confirm)
     * @throws ISerializationStrategy.SerializationException if serialization fails
     */
    public void serializeToDisk(String str) throws ISerializationStrategy.SerializationException {
        Serializer.getSerializer().serialize(this, str);
    }

    /**
     * Not supported by this model.
     *
     * @param collection ignored
     * @throws UnsupportedOperationException always
     */
    @Override // codemining.lm.ILanguageModel
    public void trainIncrementalModel(Collection<File> collection) throws IOException {
        throw new UnsupportedOperationException();
    }

    /**
     * Trains the model from scratch, replacing any existing vocabulary mapper and memoizer.
     *
     * <p>Steps: build the vocabulary from the files, create a fresh memoizer sized by the
     * vocabulary, feed each sampled file as a new sequence, then run
     * {@link #NUM_ITERATIONS} sampling iterations. A file that fails to load is logged and
     * skipped, so training is best-effort per file.
     *
     * @param collection the training files
     * @throws IOException declared by the interface
     */
    @Override // codemining.lm.ILanguageModel
    public void trainModel(Collection<File> collection) throws IOException {
        Set<String> vocabulary = TokenVocabularyBuilder.buildVocabulary(collection, this.tokenizer, CLEAN_VOCABULARY_THRESHOLD);
        this.vocabularyMapper = new VocabularyToInt(this.tokenizer, vocabulary);
        LOGGER.info("Vocabulary Built. Reading files...");
        this.memoizer = new IntSequenceMemoizer(new IntSequenceMemoizerParameters(vocabulary.size()));
        for (File file : collection) {
            try {
                // Randomly subsample the training set according to SAMPLE_PERCENTAGE.
                if (RandomUtils.nextDouble() <= SAMPLE_PERCENTAGE) {
                    this.memoizer.newSequence();
                    this.memoizer.continueSequence(this.vocabularyMapper.fileToIntSequence(file));
                }
            } catch (Throwable th) {
                // Deliberately broad: one bad file must not abort the whole training run.
                LOGGER.warning("Failed to add file " + file.getAbsolutePath() + " " + ExceptionUtils.getFullStackTrace(th));
            }
        }
        LOGGER.info("Sequences stored. Stating " + NUM_ITERATIONS + " iterations");
        this.memoizer.sample(NUM_ITERATIONS);
    }
}
