package de.tuberlin.cis.bilke.dumas.string;

import de.tuberlin.cis.bilke.dumas.string.tokens.SimpleTokenizer;
import de.tuberlin.cis.bilke.dumas.string.tokens.Token;
import de.tuberlin.cis.bilke.dumas.string.tokens.Tokenizer;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;

/* loaded from: input_file:de/tuberlin/cis/bilke/dumas/string/AbstractStatisticalTokenDistance.class */
/**
 * Base class for token-based string distances that rely on corpus statistics.
 *
 * <p>Statistics are gathered by {@link #accumulateStringArrayStatistics(Iterator)}:
 * every {@code String[]} delivered by the iterator is tokenized and counted as one
 * document. For each token the number of documents containing it is recorded in a
 * {@link DocumentFrequency}; in addition the number of documents
 * ({@link #_collectionSize}) and the number of tokens including repetitions
 * ({@link #_totalTokenCount}) are tracked.
 *
 * <p>{@link #_tokenizer} and the internal scratch set are declared {@code transient}.
 * NOTE(review): if instances are serialized (the superclass is not visible here),
 * the tokenizer has to be supplied again through {@link #setTokenizer(Tokenizer)}
 * after restoring; the scratch set is re-created on demand.
 */
public abstract class AbstractStatisticalTokenDistance extends AbstractStringDistance {
    /** Tokenizer used to split incoming string arrays into tokens. */
    protected transient Tokenizer _tokenizer;

    /** Store of per-token document frequencies; replaceable via {@link #setStatistics}. */
    protected DocumentFrequency _documentFrequency;

    /** Number of documents (string arrays) added to the statistics so far. */
    protected int _collectionSize;

    /** Number of tokens seen so far, counting repeated tokens every time. */
    protected int _totalTokenCount;

    /**
     * Scratch set holding the distinct tokens of the document currently being
     * processed. Transient and therefore possibly {@code null}; it is re-created
     * lazily in {@link #addToStatistics(Token[])}.
     */
    private transient HashSet<Token> _seenTokens;

    /**
     * {@link DocumentFrequency} implementation that keeps all counts in a
     * {@link HashMap} in memory.
     */
    public class MemoryDocumentFrequency implements DocumentFrequency {
        /** Maps a token to the number of documents it occurred in. */
        private HashMap<Token, Integer> _dfMap = new HashMap<Token, Integer>();

        public MemoryDocumentFrequency() {
        }

        /**
         * Returns the recorded document frequency of a token.
         *
         * @param token the token to look up
         * @return the stored count, or {@code 0} if the token was never counted
         */
        @Override // de.tuberlin.cis.bilke.dumas.string.DocumentFrequency
        public int getDocumentFrequency(Token token) {
            Integer count = this._dfMap.get(token);
            return count == null ? 0 : count.intValue();
        }

        /**
         * Raises the document frequency of a token by one, starting at one for
         * tokens that have no entry yet.
         *
         * @param token the token whose count is incremented
         */
        public void incrementDocumentFrequency(Token token) {
            Integer count = this._dfMap.get(token);
            int next = (count == null) ? 1 : count.intValue() + 1;
            // Integer.valueOf reuses cached instances for small values, so no
            // hand-maintained constants or identity comparisons are needed.
            this._dfMap.put(token, Integer.valueOf(next));
        }

        /**
         * Same as {@link #incrementDocumentFrequency(Token)}; this implementation
         * has no separate cache.
         *
         * @param token the token whose count is incremented
         */
        @Override // de.tuberlin.cis.bilke.dumas.string.DocumentFrequency
        public void incrementCachedDocumentFrequency(Token token) {
            incrementDocumentFrequency(token);
        }

        /**
         * No-op: all increments are applied to the map immediately, so there is
         * nothing left to do when accumulation ends.
         */
        @Override // de.tuberlin.cis.bilke.dumas.string.DocumentFrequency
        public void finalize() {
        }
    }

    /**
     * Creates a distance with empty in-memory statistics.
     *
     * @param tokenizer tokenizer used to split strings into tokens
     */
    public AbstractStatisticalTokenDistance(Tokenizer tokenizer) {
        this._documentFrequency = new MemoryDocumentFrequency();
        this._collectionSize = 0;
        this._totalTokenCount = 0;
        this._seenTokens = new HashSet<Token>();
        this._tokenizer = tokenizer;
    }

    /** Creates a distance that uses {@link SimpleTokenizer#DEFAULT_TOKENIZER}. */
    public AbstractStatisticalTokenDistance() {
        this(SimpleTokenizer.DEFAULT_TOKENIZER);
    }

    /**
     * Tokenizes every element delivered by the iterator, adds it to the
     * statistics as one document, and finally calls
     * {@link DocumentFrequency#finalize()} on the current frequency store.
     *
     * @param it iterator whose elements are cast to {@code String[]}
     * @throws ClassCastException if an element is not a {@code String[]}
     */
    @Override // de.tuberlin.cis.bilke.dumas.string.StringDistance
    public void accumulateStringArrayStatistics(Iterator it) {
        while (it.hasNext()) {
            String[] document = (String[]) it.next();
            addToStatistics(this._tokenizer.tokenizeTemp(document));
        }
        this._documentFrequency.finalize();
    }

    /**
     * Adds the tokens of one document to the statistics: every token increases
     * the total token count, but the document frequency of a token is increased
     * only once per document.
     *
     * @param tokenArr tokens of a single document
     */
    private void addToStatistics(Token[] tokenArr) {
        if (this._seenTokens == null) {
            // The field is transient and assigned only by the constructor, so it
            // can be null on an instance that was restored without running it.
            this._seenTokens = new HashSet<Token>();
        }
        this._seenTokens.clear();
        for (Token token : tokenArr) {
            this._totalTokenCount++;
            // add() returns true only for the first occurrence within this document.
            if (this._seenTokens.add(token)) {
                this._documentFrequency.incrementCachedDocumentFrequency(token);
            }
        }
        this._collectionSize++;
    }

    /**
     * Returns the document frequency of a token as reported by the current
     * frequency store.
     *
     * @param token the token to look up
     * @return the document frequency of {@code token}
     */
    public int getDocumentFrequency(Token token) {
        return this._documentFrequency.getDocumentFrequency(token);
    }

    /**
     * Replaces the frequency store. The collection size and total token count
     * are left unchanged.
     *
     * @param documentFrequency the frequency store to use from now on
     */
    public void setStatistics(DocumentFrequency documentFrequency) {
        this._documentFrequency = documentFrequency;
    }

    /**
     * Returns the frequency store currently in use.
     *
     * @return the current {@link DocumentFrequency}
     */
    @Override // de.tuberlin.cis.bilke.dumas.string.StringDistance
    public DocumentFrequency getStatistics() {
        return this._documentFrequency;
    }

    /**
     * Replaces the tokenizer used for subsequently accumulated documents.
     *
     * @param tokenizer the tokenizer to use from now on
     */
    public void setTokenizer(Tokenizer tokenizer) {
        this._tokenizer = tokenizer;
    }
}
