package de.tuberlin.cis.bilke.dumas.duplicate;

import de.tuberlin.cis.bilke.dumas.Dumas;
import de.tuberlin.cis.bilke.dumas.DumasException;
import de.tuberlin.cis.bilke.dumas.datastructures.Alignment;
import de.tuberlin.cis.bilke.dumas.datastructures.MatchScore;
import de.tuberlin.cis.bilke.dumas.db.RecordId;
import de.tuberlin.cis.bilke.dumas.db.RecordIterator;
import de.tuberlin.cis.bilke.dumas.db.Table;
import de.tuberlin.cis.bilke.dumas.index.InvertedIndex;
import de.tuberlin.cis.bilke.dumas.string.BagOfTokens;
import de.tuberlin.cis.bilke.dumas.string.EditDistance;
import de.tuberlin.cis.bilke.dumas.string.StringDistance;
import de.tuberlin.cis.bilke.dumas.string.StringWrapper;
import de.tuberlin.cis.bilke.dumas.string.tokens.SimpleTokenizer;
import de.tuberlin.cis.bilke.dumas.string.tokens.Token;
import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;

/* loaded from: input_file:de/tuberlin/cis/bilke/dumas/duplicate/TFIDFMatch.class */
/**
 * Record matcher that looks for likely duplicate record pairs between a source
 * and a target table.
 *
 * <p>Whole records are scored with the record-level {@link StringCompare}
 * handed to the superclass ({@link #DEFAULT_COMPARE}). Candidate target records
 * are located through an {@link InvertedIndex} built over the target table and
 * explored best-first via a priority queue of {@code SearchState}s (see
 * {@link #matchPrepared(TableWrapper, TableWrapper)}).
 *
 * <p>Typical call sequence: {@link #load(Table, Table)} followed by
 * {@link #match()}.
 */
public class TFIDFMatch extends AbstractRecordMatch {
    /** Comparator used for field-level scoring; backs {@link #_fieldMatch}. */
    protected StringCompare _fieldCompare = null;

    /** Field matcher used by {@link #score} to fill the optional score matrix. */
    protected FieldMatch _fieldMatch = null;

    /** Queue used by the most recent {@link #matchPrepared} run. */
    private PriorityQueue _queue;

    /**
     * When {@code true}, {@link #loadSource} / {@link #loadTarget} let the
     * record-level comparator preprocess the given records before the
     * statistics are shared with the field comparator. Nothing in this class
     * ever sets it to {@code false}.
     */
    private boolean _collectStatistics = true;

    /** Lower bound for the automatically chosen number of wanted duplicates. */
    private static final int MINNUMDUPLICATES = 3;

    /** Divisor applied to the smaller table size when choosing the number of wanted duplicates. */
    private static final int NUMDUPLICATES = 10;

    /**
     * Scores strictly above this value are accepted as matches immediately;
     * scores at or below it are queued as goal states instead.
     */
    private static final double DUPTHRESHOLD = 1.0d;

    /** Number of duplicate pairs to look for; values {@code <= 0} mean "choose automatically". */
    private int _wantedDuplicates = -1;

    /** Default record-level comparator. */
    public static final StringCompare DEFAULT_COMPARE = new TFIDFCompare();

    /** Default field-level comparator. */
    public static final StringCompare DEFAULT_FIELDCOMPARE = new SoftTFIDFCompare(SimpleTokenizer.DEFAULT_TOKENIZER, new EditDistance(), 0.5d);

    /**
     * Creates a matcher that uses {@link #DEFAULT_COMPARE} for records and
     * {@link #DEFAULT_FIELDCOMPARE} for fields.
     *
     * @param z currently ignored by this constructor.
     *          NOTE(review): the parameter may have been meant to control
     *          statistics collection - confirm against the original sources.
     */
    public TFIDFMatch(boolean z) {
        super(DEFAULT_COMPARE);
        this._fieldCompare = DEFAULT_FIELDCOMPARE;
        this._fieldMatch = new FieldMatch(this._fieldCompare);
    }

    /**
     * Loads both tables using iterators over all of their records.
     *
     * @param table  source table
     * @param table2 target table
     */
    @Override
    public void load(Table table, Table table2) {
        load(table, table.recordIterator(), table2, table2.recordIterator());
    }

    /**
     * Loads source and target (in that order) and logs the elapsed time at
     * {@link Level#FINE}.
     *
     * @param table           source table
     * @param recordIterator  records handed to {@link #loadSource}
     * @param table2          target table
     * @param recordIterator2 records handed to {@link #loadTarget}
     */
    public void load(Table table, RecordIterator recordIterator, Table table2, RecordIterator recordIterator2) {
        final Logger log = Dumas.getLogger();
        final String className = getClass().getName();
        log.logp(Level.FINE, className, "load", "Start accumulating statistics.");
        final long startMillis = System.currentTimeMillis();

        loadSource(table, recordIterator);
        log.logp(Level.FINE, className, "load", "Statistics for first database collected.");

        loadTarget(table2, recordIterator2);
        log.logp(Level.FINE, className, "load", "Statistics for second databases collected.");

        final long elapsedMillis = System.currentTimeMillis() - startMillis;
        log.logp(Level.FINE, className, "load", "Finished accumulating statistics in " + elapsedMillis + " ms.");
    }

    /**
     * Prepares the source side: optionally preprocesses the records with the
     * record-level comparator, shares its source statistics with the field
     * comparator and installs a new source {@link TableWrapper}.
     *
     * @param table          source table to wrap
     * @param recordIterator records used for preprocessing
     */
    public void loadSource(Table table, RecordIterator recordIterator) {
        if (this._collectStatistics) {
            this._compare.preprocessSource(recordIterator);
        }
        this._fieldCompare.useStatisticsForSource(this._compare.getSourceStatistics());
        setSourceWrapper(new TableWrapper(this._compare.getSourceDistance(), this._fieldCompare.getSourceDistance(), table));
    }

    /**
     * Prepares the target side; mirror image of {@link #loadSource}.
     *
     * @param table          target table to wrap
     * @param recordIterator records used for preprocessing
     */
    public void loadTarget(Table table, RecordIterator recordIterator) {
        if (this._collectStatistics) {
            this._compare.preprocessTarget(recordIterator);
        }
        this._fieldCompare.useStatisticsForTarget(this._compare.getTargetStatistics());
        setTargetWrapper(new TableWrapper(this._compare.getTargetDistance(), this._fieldCompare.getTargetDistance(), table));
    }

    /**
     * Runs {@link #match(Collection, Collection, Alignment)} with all three
     * arguments set to {@code null}.
     *
     * @return the collected match scores
     */
    public Collection match() {
        return match(null, null, null);
    }

    /**
     * Prepares both table wrappers, chooses the number of wanted duplicates if
     * none was set, and starts the search.
     *
     * <p>If {@link #getWantedDuplicates()} is {@code <= 0} it is set to
     * {@code max(1 + min(min(sourceSize, targetSize) / 10, 9), 3)}, i.e. a
     * value between 3 and 10.
     *
     * @param collection  forwarded to the source wrapper's {@code prepareWrappers}
     * @param collection2 forwarded to the target wrapper's {@code prepareWrappers}
     * @param alignment   must be {@code null}; known alignments are not supported
     * @return the collected match scores
     * @throws DumasException if either table has no records or {@code alignment}
     *                        is not {@code null}
     */
    @Override
    public Collection match(Collection collection, Collection collection2, Alignment alignment) {
        final Logger log = Dumas.getLogger();
        final String className = getClass().getName();
        final TableWrapper source = getSourceWrapper();
        final TableWrapper target = getTargetWrapper();

        final int sourceCount = source.numRecords();
        if (sourceCount == 0) {
            throw new DumasException("Source database does not contain any tuples.");
        }
        final int targetCount = target.numRecords();
        if (targetCount == 0) {
            throw new DumasException("Target database does not contain any tuples.");
        }

        log.logp(Level.FINE, className, "match", "Preparing wrappers.");
        source.prepareWrappers(collection);
        target.prepareWrappers(collection2);

        if (getWantedDuplicates() <= 0) {
            final int smallerCount = Math.min(sourceCount, targetCount);
            final int scaled = 1 + Math.min(smallerCount / NUMDUPLICATES, 9);
            setWantedDuplicates(Math.max(scaled, MINNUMDUPLICATES));
        }
        if (alignment != null) {
            throw new DumasException("TFIDFMatch does not support known alignment.");
        }

        log.logp(Level.FINE, className, "match", "Start the matching process.");
        return matchPrepared(source, target);
    }

    /**
     * Searches for duplicates between two already prepared wrappers. Creates a
     * fresh priority queue and an inverted index over the second wrapper.
     *
     * @param tableWrapper  prepared source wrapper
     * @param tableWrapper2 prepared target wrapper
     * @return the collected match scores
     */
    public Collection matchPrepared(TableWrapper tableWrapper, TableWrapper tableWrapper2) {
        final Logger log = Dumas.getLogger();
        final String className = getClass().getName();
        this._queue = new PriorityQueue(MINNUMDUPLICATES * tableWrapper.size());
        log.logp(Level.FINE, className, "matchPrepared", "Creating index for second database.");
        final InvertedIndex targetIndex = createInvertedIndex(tableWrapper2);
        return whirlSearch(tableWrapper, tableWrapper2, targetIndex, this._queue, this._compare);
    }

    /**
     * Builds an inverted index with one posting per (token, record) pair of the
     * given wrapper, weighted by the token's weight in that record's bag.
     */
    private InvertedIndex createInvertedIndex(TableWrapper tableWrapper) {
        final InvertedIndex index = new InvertedIndex();
        for (Iterator records = tableWrapper.recordWrapperIterator(); records.hasNext();) {
            final RecordWrapper record = (RecordWrapper) records.next();
            final BagOfTokens bag = (BagOfTokens) record.getStringWrapper();
            for (Iterator tokens = bag.tokenIterator(); tokens.hasNext();) {
                final Token token = (Token) tokens.next();
                index.addPosting(token, record.getRecordId(), bag.getWeight(token));
            }
        }
        return index;
    }

    /**
     * Best-first search for the highest scoring source/target pairs.
     *
     * <p>Every source record starts as a search state whose value is the sum of
     * {@code weight(token) * maxWeight(token)} over its tokens. The state with
     * the largest value is repeatedly extracted: a state that already has a
     * target is turned into a match (unless its source or target is already
     * matched); any other state is expanded on its first remaining token by
     * scoring the target records posted under that token.
     *
     * @return the match scores collected in the {@code MatchScoreList}
     */
    private Collection whirlSearch(TableWrapper tableWrapper, TableWrapper tableWrapper2, InvertedIndex invertedIndex, PriorityQueue priorityQueue, StringCompare stringCompare) {
        final MatchScoreList matches = new MatchScoreList(getWantedDuplicates());
        final Logger log = Dumas.getLogger();
        final String className = getClass().getName();
        log.logp(Level.FINE, className, "whirlSearch", "Start comparing records.");
        final long startMillis = System.currentTimeMillis();
        long comparisons = 0;

        seedSearchQueue(tableWrapper, invertedIndex, priorityQueue);

        final GoalStateList goals = new GoalStateList(getWantedDuplicates());
        int found = 0;
        while (found < getWantedDuplicates() && !priorityQueue.isEmpty()) {
            final SearchState state = (SearchState) priorityQueue.extractMax();
            final RecordId sourceId = state.getSource();
            final RecordWrapper sourceRecord = tableWrapper.getRecordWrapper(sourceId);
            final BagOfTokens sourceBag = (BagOfTokens) sourceRecord.getStringWrapper();

            if (state.getTarget() != null) {
                // Goal state: the pair has been scored already, accept it if both sides are free.
                final RecordId targetId = state.getTarget();
                final RecordWrapper targetRecord = tableWrapper2.getRecordWrapper(targetId);
                if (isPairUnmatched(matches, sourceId, targetId)) {
                    matches.add(buildMatchScore(sourceId, targetId, state.getValue().doubleValue(), sourceRecord, targetRecord));
                    found++;
                }
                continue;
            }

            final Token expandToken = state.removeFirstToken();
            if (expandToken == null) {
                // No tokens left to expand; the state is dropped.
                continue;
            }

            final Token[] exclusions = state.exclusions();
            final List postings = invertedIndex.getPostings(expandToken);
            final RecordId[] candidateIds = new RecordId[postings.size()];
            for (int k = 0; k < candidateIds.length; k++) {
                candidateIds[k] = (RecordId) postings.get(k);
            }

            for (RecordWrapper candidate : tableWrapper2.getRecordWrappers(candidateIds)) {
                final BagOfTokens candidateBag = (BagOfTokens) candidate.getStringWrapper();
                // Skip candidates containing an excluded token (assumes contains() is a pure lookup).
                if (containsAnyToken(candidateBag, exclusions)) {
                    continue;
                }
                final double score = stringCompare.score(sourceRecord.getStringWrapper(), candidate.getStringWrapper());
                comparisons++;
                if (!goals.couldInsert(score) || found >= getWantedDuplicates()) {
                    continue;
                }
                final RecordId targetId = candidate.getRecordId();
                final SearchState goal = new SearchState(sourceId, score, null);
                goal.setTarget(targetId);
                goals.insert(goal);
                if (score <= DUPTHRESHOLD) {
                    // Not above the threshold: let it compete in the queue.
                    priorityQueue.insert(goal);
                } else if (isPairUnmatched(matches, sourceId, targetId)) {
                    matches.add(buildMatchScore(sourceId, targetId, score, sourceRecord, candidate));
                    found++;
                }
            }

            // Re-queue the partially expanded state with a bound over its remaining tokens.
            state.addExclusion(expandToken);
            double bound = 0.0d;
            for (Token remaining : state.getTokens()) {
                bound += sourceBag.getWeight(remaining) * invertedIndex.getMaxWeight(remaining);
            }
            if (goals.couldInsert(bound)) {
                state.setValue(bound);
                priorityQueue.insert(state);
            }
        }

        final long elapsedMillis = System.currentTimeMillis() - startMillis;
        // Widen before multiplying so the product cannot wrap around in int arithmetic.
        final long possiblePairs = (long) tableWrapper.size() * tableWrapper2.size();
        log.logp(Level.FINE, className, "whirlSearch", "> > > Finished comparing records in " + elapsedMillis + " ms.");
        log.logp(Level.FINE, className, "whirlSearch", "Possible pairs: " + possiblePairs);
        log.logp(Level.FINE, className, "whirlSearch", "Number of comparisons: " + comparisons);
        return matches.getArrayList();
    }

    /** Inserts one initial search state per source record, valued by the sum of weight * maxWeight over its tokens. */
    private static void seedSearchQueue(TableWrapper sources, InvertedIndex index, PriorityQueue queue) {
        for (Iterator records = sources.recordWrapperIterator(); records.hasNext();) {
            final RecordWrapper record = (RecordWrapper) records.next();
            final BagOfTokens bag = (BagOfTokens) record.getStringWrapper();
            double bound = 0.0d;
            for (Iterator tokens = bag.tokenIterator(); tokens.hasNext();) {
                final Token token = (Token) tokens.next();
                bound += bag.getWeight(token) * index.getMaxWeight(token);
            }
            queue.insert(new SearchState(record.getRecordId(), bound, bag.sortedTokenList()));
        }
    }

    /** Returns {@code true} as soon as the bag contains one of the given tokens. */
    private static boolean containsAnyToken(BagOfTokens bag, Token[] tokens) {
        for (Token token : tokens) {
            if (bag.contains(token)) {
                return true;
            }
        }
        return false;
    }

    /** Returns {@code true} if neither the source nor the target record is part of a collected match yet. */
    private static boolean isPairUnmatched(MatchScoreList matches, RecordId sourceId, RecordId targetId) {
        return !matches.hasSourceRecord(sourceId) && !matches.hasTargetRecord(targetId);
    }

    /** Creates a {@link MatchScore} for the pair with the given score and both record wrappers attached. */
    private static MatchScore buildMatchScore(RecordId sourceId, RecordId targetId, double score, RecordWrapper sourceRecord, RecordWrapper targetRecord) {
        final MatchScore result = new MatchScore(sourceId, targetId);
        result.setScore(score);
        result.setSourceWrapper(sourceRecord);
        result.setTargetWrapper(targetRecord);
        return result;
    }

    /**
     * Scores a single pair of records with the record-level comparator.
     *
     * <p>Known alignments are rejected up front. An earlier, unreachable branch
     * that averaged the per-field scores of aligned fields together with the
     * whole-record score has therefore been removed.
     *
     * @param recordWrapper  source record
     * @param recordWrapper2 target record
     * @param alignment      must be {@code null}
     * @param z              when {@code true}, the field comparison matrix is
     *                       computed and attached to the result
     * @return the match score for the pair
     * @throws DumasException if {@code alignment} is not {@code null}
     */
    @Override
    public MatchScore score(RecordWrapper recordWrapper, RecordWrapper recordWrapper2, Alignment alignment, boolean z) {
        if (alignment != null) {
            throw new DumasException("TFIDFMatch does not support known alignment.");
        }
        final StringWrapper sourceText = recordWrapper.getStringWrapper();
        final StringWrapper targetText = recordWrapper2.getStringWrapper();

        final MatchScore result = new MatchScore(recordWrapper.getRecordId(), recordWrapper2.getRecordId());
        result.setSourceWrapper(recordWrapper);
        result.setTargetWrapper(recordWrapper2);
        result.setScore(this._compare.score(sourceText, targetText));

        if (z) {
            // alignment is always null here; it is passed through unchanged.
            result.setMatrix(this._fieldMatch.compareFields(recordWrapper, recordWrapper2, alignment));
        }
        return result;
    }

    /**
     * Sets how many duplicate pairs the search should look for.
     *
     * @param i wanted number of duplicates; a value {@code <= 0} makes
     *          {@link #match(Collection, Collection, Alignment)} choose one
     */
    public void setWantedDuplicates(int i) {
        this._wantedDuplicates = i;
    }

    /**
     * @return the number of duplicate pairs the search looks for, or a value
     *         {@code <= 0} if it has not been chosen yet
     */
    public int getWantedDuplicates() {
        return this._wantedDuplicates;
    }

    /**
     * @return the source-side distance of the field comparator
     */
    public StringDistance getFieldDistance() {
        return this._fieldCompare.getSourceDistance();
    }
}
