package codemining.lm.ngram;

import codemining.languagetools.ITokenizer;
import codemining.lm.ITokenGeneratingLanguageModel;
import codemining.lm.ngram.Trie;
import codemining.util.SettingsLoader;
import codemining.util.serialization.ISerializationStrategy;
import codemining.util.serialization.Serializer;
import com.google.common.base.Preconditions;
import com.google.common.collect.ImmutableList;
import com.google.common.collect.Lists;
import com.google.common.collect.Multiset;
import com.google.common.collect.Sets;
import com.google.common.collect.TreeMultiset;
import com.google.common.math.DoubleMath;
import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.logging.Logger;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.filefilter.AbstractFileFilter;

/* JADX WARN: Classes with same name are omitted:
  input_file:lib/naturalize.jar:codemining/lm/ngram/AbstractNGramLM.class
 */
/* loaded from: input_file:naturalize.jar:codemining/lm/ngram/AbstractNGramLM.class */
/**
 * Base class for n-gram language models whose n-gram counts are stored in a {@link LongTrie}.
 *
 * <p>This class provides the functionality that only needs the trie and an abstract
 * {@link #getProbabilityFor(NGram)}: scoring a token sequence, sampling a random sentence,
 * proposing alternative tokens for a position of an n-gram, and (de)serialization. How n-grams are
 * added to the trie and how probabilities are smoothed is left to subclasses.
 */
public abstract class AbstractNGramLM implements ITokenGeneratingLanguageModel<ITokenizer.FullToken> {
    private static final long serialVersionUID = -5876426022517622146L;

    /** Symbol passed to the trie constructor as its unknown-token marker. */
    public static final String UNK_SYMBOL = "UNK_SYMBOL";

    /**
     * When true, {@link #getLogProbOfSentence(List)} logs every n-gram probability it uses.
     * Read once from the setting {@code uk.ac.ed.inf.ngram.debugProbs}.
     */
    public static final boolean DEBUG_PROBS = SettingsLoader.getBooleanSetting("uk.ac.ed.inf.ngram.debugProbs", false);

    private static final Logger LOGGER = Logger.getLogger(AbstractNGramLM.class.getName());

    /** The order (n) of the model. Not final because the no-arg constructor leaves it unset. */
    private int nGramSize;

    /** Tokenizer used to turn source text into token strings and strings back into tokens. */
    protected ITokenizer tokenizer;

    /** Trie holding the n-gram counts. */
    protected LongTrie<String> trie;

    /**
     * Formats a probability for debug output as {@code P(last|prefix)= value}.
     *
     * @param nGram the n-gram whose last token is being predicted; must not be empty
     * @param d the probability to print
     * @return the formatted string
     */
    public static String getProbString(NGram<String> nGram, double d) {
        return new StringBuilder()
                .append("P(")
                .append(nGram.get(nGram.size() - 1))
                .append('|')
                .append(nGram.getPrefix())
                .append(")= ")
                .append(d)
                .toString();
    }

    /**
     * Deserializes a model using the serializer returned by {@link Serializer#getSerializer()}.
     *
     * @param str the location handed to the serializer's {@code deserializeFrom}
     * @return the deserialized model
     * @throws ISerializationStrategy.SerializationException if deserialization fails
     * @throws ClassCastException if the deserialized object is not an {@code AbstractNGramLM}
     */
    public static AbstractNGramLM readFromSerialized(String str) throws ISerializationStrategy.SerializationException {
        return (AbstractNGramLM) Serializer.getSerializer().deserializeFrom(str);
    }

    /**
     * Alias of {@link #readFromSerialized(String)}, kept for callers that use this name.
     *
     * @param str the location handed to the serializer's {@code deserializeFrom}
     * @return the deserialized model
     * @throws ISerializationStrategy.SerializationException if deserialization fails
     */
    public static AbstractNGramLM readSerialized(String str) throws ISerializationStrategy.SerializationException {
        return readFromSerialized(str);
    }

    /**
     * Creates a model with no trie, tokenizer or n-gram size set.
     * NOTE(review): presumably exists for deserialization frameworks - confirm.
     */
    /* JADX INFO: Access modifiers changed from: protected */
    public AbstractNGramLM() {
    }

    /**
     * Creates a model that shares the trie and tokenizer of another model (no copy is made).
     *
     * @param abstractNGramLM the model whose n-gram size, trie and tokenizer are reused
     */
    /* JADX INFO: Access modifiers changed from: protected */
    public AbstractNGramLM(AbstractNGramLM abstractNGramLM) {
        this.nGramSize = abstractNGramLM.nGramSize;
        this.trie = abstractNGramLM.trie;
        this.tokenizer = abstractNGramLM.getTokenizer();
    }

    /**
     * Creates an empty model of the given order.
     *
     * @param i the n-gram size (n)
     * @param iTokenizer the tokenizer used by this model
     */
    public AbstractNGramLM(int i, ITokenizer iTokenizer) {
        this.nGramSize = i;
        this.trie = new LongTrie<>(UNK_SYMBOL);
        this.tokenizer = iTokenizer;
    }

    public abstract void addFromSentence(List<String> list, boolean z);

    protected abstract void addNgramToDict(NGram<String> nGram, boolean z);

    public abstract void addSentences(Set<List<String>> set, boolean z);

    public abstract void cutoffRare(int i);

    /**
     * Samples a sentence from the model.
     *
     * <p>Starts from {@link ITokenizer#SENTENCE_START} and repeatedly draws the next token with
     * {@link #pickRandom(NGram)}, conditioning on the n-gram of size {@code getN() - 1} that ends
     * at the last generated token, until {@link ITokenizer#SENTENCE_END} is drawn. Both the start
     * and the end marker are part of the result.
     *
     * @return the sampled tokens, converted with the tokenizer's {@code getTokenFromString}
     */
    @Override // codemining.lm.ITokenGeneratingLanguageModel
    public List<ITokenizer.FullToken> generateSentence() {
        List<String> sentence = Lists.newArrayList();
        String token = ITokenizer.SENTENCE_START;
        while (!token.equals(ITokenizer.SENTENCE_END)) {
            sentence.add(token);
            token = pickRandom(NGram.constructNgramAt(sentence.size() - 1, sentence, getN() - 1));
        }
        // The loop exits before the end marker is stored; append it explicitly.
        sentence.add(token);

        List<ITokenizer.FullToken> fullTokens = new ArrayList<>(sentence.size());
        for (String generated : sentence) {
            fullTokens.add(this.tokenizer.getTokenFromString(generated));
        }
        return fullTokens;
    }

    /**
     * Reads the file and delegates to {@link #getAbsoluteEntropy(String)}.
     *
     * @param file the file to score
     * @return the value computed for the file's content
     * @throws IOException if the file cannot be read
     */
    @Override // codemining.lm.ILanguageModel
    public double getAbsoluteEntropy(File file) throws IOException {
        return getAbsoluteEntropy(FileUtils.readFileToString(file));
    }

    /**
     * Tokenizes the text and returns {@link #getLogProbOfSentence(List)} of the tokens.
     *
     * @param str the text to score
     * @return the summed log2 probability, or {@code 0} if the text or its token list is empty
     */
    @Override // codemining.lm.ILanguageModel
    public double getAbsoluteEntropy(String str) {
        char[] code = str.toCharArray();
        if (code.length == 0) {
            return 0.0d;
        }
        List<String> tokens = ImmutableList.copyOf(getTokenizer().tokenListFromCode(code));
        if (tokens.isEmpty()) {
            return 0.0d;
        }
        return getLogProbOfSentence(tokens);
    }

    /**
     * Collects the alternatives for {@code str} over every distinct n-gram of the multiset.
     *
     * <p>Each distinct n-gram contributes its alternative set once, regardless of its own count
     * in {@code multiset}; a symbol's count in the result is therefore the number of distinct
     * n-grams that proposed it.
     *
     * @param multiset the n-grams to inspect
     * @param str the token text to find alternatives for
     * @return a sorted multiset of alternative symbols
     */
    public Multiset<String> getAlternativeNamings(Multiset<NGram<String>> multiset, String str) {
        TreeMultiset<String> alternatives = TreeMultiset.create();
        LongTrie<String> globalTrie = getTrie();
        for (Multiset.Entry<NGram<String>> entry : multiset.entrySet()) {
            alternatives.addAll(
                    Preconditions.checkNotNull(getAlternativesForNGram(globalTrie, entry.getElement(), str)));
        }
        return alternatives;
    }

    /**
     * Finds the symbols the trie has seen in place of the last token of {@code nGram} that
     * contains {@code str}.
     *
     * <p>The n-gram is shortened from the right until its last token contains {@code str}. The
     * tokens before that one are looked up in the trie; if they are not found the result is empty.
     * Otherwise every child symbol of that node is a candidate. When the context is shorter than
     * {@code getN() - 1}, a candidate is kept only if the tokens of the original n-gram from
     * index {@code context.size() + 1} on, with {@code str} substituted by the candidate, can also
     * be followed from the candidate's node.
     *
     * <p>NOTE(review): if no token of {@code nGram} contains {@code str}, the shortening loop runs
     * until the n-gram is exhausted; what happens then depends on {@code NGram} - confirm.
     *
     * @param longTrie the trie to search
     * @param nGram the n-gram containing the token of interest
     * @param str the text identifying the token of interest
     * @return a sorted set of alternative symbols, possibly empty
     */
    public Set<String> getAlternativesForNGram(LongTrie<String> longTrie, NGram<String> nGram, String str) {
        // Drop trailing tokens until the last one is the token of interest.
        NGram<String> upToTarget = nGram;
        while (!upToTarget.get(upToTarget.size() - 1).contains(str)) {
            upToTarget = upToTarget.getPrefix();
        }

        NGram<String> context = upToTarget.getPrefix();
        Trie.TrieNode<Long> contextNode = longTrie.getNGramNodeForInput(context, false);
        if (contextNode == null) {
            return Collections.emptySet();
        }

        Set<String> alternatives = Sets.newTreeSet();
        if (context.size() != getN() - 1) {
            // Tokens of the original n-gram that follow the target position.
            NGram<String> remainder = new NGram<>(nGram, context.size() + 1, nGram.size());
            for (Map.Entry<Long, Trie.TrieNode<Long>> production : contextNode.prods.entrySet()) {
                String candidate = longTrie.getSymbolFromKey(production.getKey());
                NGram<String> substituted = NGram.substituteTokenWith(remainder, str, candidate);
                if (longTrie.getNGramNodeForInput(substituted, false, production.getValue()) != null) {
                    alternatives.add(candidate);
                }
            }
        } else {
            // The context already has the maximum length, so every production is accepted.
            for (Long key : contextNode.prods.keySet()) {
                alternatives.add(longTrie.getSymbolFromKey(key));
            }
        }
        return alternatives;
    }

    /**
     * Reads the file and delegates to {@link #getExtrinsticEntropy(String)}.
     *
     * @param file the file to score
     * @return the value computed for the file's content
     * @throws IOException if the file cannot be read
     */
    @Override // codemining.lm.ILanguageModel
    public double getExtrinsticEntropy(File file) throws IOException {
        return getExtrinsticEntropy(FileUtils.readFileToString(file));
    }

    /**
     * Tokenizes the text and returns its summed log2 probability divided by
     * {@code tokenCount - 1}.
     *
     * <p>For a text that tokenizes to exactly one token the divisor is zero, so the result is not
     * a finite number.
     *
     * @param str the text to score
     * @return the normalised log2 probability, or {@code 0} if the text or its token list is empty
     */
    @Override // codemining.lm.ILanguageModel
    public double getExtrinsticEntropy(String str) {
        char[] code = str.toCharArray();
        if (code.length == 0) {
            return 0.0d;
        }
        List<String> tokens = ImmutableList.copyOf(getTokenizer().tokenListFromCode(code));
        if (tokens.isEmpty()) {
            return 0.0d;
        }
        return getLogProbOfSentence(tokens) / (tokens.size() - 1.0d);
    }

    /**
     * Sums {@code log2(getProbabilityFor(ngram))} over the n-grams ending at each position of the
     * sentence. N-grams of size one or less are skipped.
     *
     * @param list the token sequence
     * @return the summed log2 probability
     * @throws IllegalArgumentException if a probability is not strictly positive or is infinite
     */
    public double getLogProbOfSentence(List<String> list) {
        double logProb = 0.0d;
        for (int i = 0; i < list.size(); i++) {
            NGram<String> ngram = NGram.constructNgramAt(i, list, this.nGramSize);
            if (ngram.size() <= 1) {
                continue;
            }
            double prob = getProbabilityFor(ngram);
            if (DEBUG_PROBS) {
                LOGGER.info(getProbString(this.trie.substituteWordsToUNK(ngram), prob));
            }
            Preconditions.checkArgument(prob > 0.0d);
            Preconditions.checkArgument(!Double.isInfinite(prob));
            logProb += DoubleMath.log2(prob);
        }
        return logProb;
    }

    /**
     * Returns the maximum-likelihood estimate {@code count(nGram) / count(prefix of nGram)}.
     *
     * @param nGram the n-gram to estimate
     * @param z forwarded unchanged as the second argument of both {@code trie.getCount} calls
     * @return the ratio of the two counts, or {@code 0} when the prefix count is zero
     * @throws IllegalArgumentException if the n-gram count exceeds its prefix count
     */
    public double getMLProbabilityFor(NGram<String> nGram, boolean z) {
        long ngramCount = this.trie.getCount(nGram, z, true);
        long prefixCount = this.trie.getCount(nGram.getPrefix(), z, false);
        if (prefixCount == 0) {
            return 0.0d;
        }
        Preconditions.checkArgument(ngramCount <= prefixCount);
        // Divide in floating point: long / long would truncate every probability to 0 or 1.
        return ((double) ngramCount) / prefixCount;
    }

    /** @return the n-gram size (n) of this model */
    public final int getN() {
        return this.nGramSize;
    }

    public abstract double getProbabilityFor(NGram<String> nGram);

    /** @return the tokenizer used by this model */
    @Override // codemining.lm.ITokenGeneratingLanguageModel
    public ITokenizer getTokenizer() {
        return this.tokenizer;
    }

    /** @return the trie backing this model (not a copy) */
    public final LongTrie<String> getTrie() {
        return this.trie;
    }

    /** @return the root node of the backing trie */
    public final Trie.TrieNode<Long> getTrieRoot() {
        return this.trie.getRoot();
    }

    /** @return the trie's key for the unknown symbol */
    public final Long getUNKSymbolId() {
        return this.trie.getUnkSymbolId();
    }

    /** @return the file filter supplied by the tokenizer */
    @Override // codemining.lm.ILanguageModel
    public AbstractFileFilter modelledFilesFilter() {
        return getTokenizer().getFileFilter();
    }

    /**
     * Draws a next token for the given context, with probability proportional to the counts the
     * trie reports for each possible production.
     *
     * <p>If the trie has no productions for the context, the draw is retried with
     * {@code nGram.getSuffix()}.
     * NOTE(review): there is no explicit base case for this back-off; termination relies on some
     * shorter context having productions - confirm how {@code getSuffix()} behaves on an empty
     * n-gram.
     *
     * @param nGram the context to continue
     * @return the sampled token
     */
    public String pickRandom(NGram<String> nGram) {
        Map<String, Long> productions = this.trie.getPossibleProductionsWithCounts(nGram);
        if (productions.size() == 0) {
            return pickRandom(nGram.getSuffix());
        }

        double total = 0.0d;
        for (Map.Entry<String, Long> production : productions.entrySet()) {
            total += production.getValue().longValue();
        }

        // Pick a point in [0, total) and return the production whose cumulative count covers it.
        double randomPoint = Math.random() * total;
        long cumulative = 0;
        for (Map.Entry<String, Long> production : productions.entrySet()) {
            cumulative += production.getValue().longValue();
            if (cumulative >= randomPoint) {
                return production.getKey();
            }
        }
        // Always throws: the cumulative count reaches the total, which is >= randomPoint.
        return (String) Preconditions.checkNotNull(null, "Should never reach this point. Picking random production failed.");
    }

    /**
     * Serializes this model using the serializer returned by {@link Serializer#getSerializer()}.
     *
     * @param str the destination handed to the serializer
     * @throws ISerializationStrategy.SerializationException if serialization fails
     */
    public void serializeToDisk(String str) throws ISerializationStrategy.SerializationException {
        Serializer.getSerializer().serialize(this, str);
    }

    /** @return the string representation of the backing trie */
    @Override
    public String toString() {
        return this.trie.toString();
    }
}
