package edu.stanford.nlp.sequences;

import edu.stanford.nlp.io.IOUtils;
import edu.stanford.nlp.io.RuntimeIOException;
import edu.stanford.nlp.ling.CoreAnnotations;
import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.objectbank.ObjectBank;
import edu.stanford.nlp.stats.ClassicCounter;
import edu.stanford.nlp.stats.Counter;
import edu.stanford.nlp.stats.Counters;
import edu.stanford.nlp.util.AbstractIterator;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Reader;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/* loaded from: input_file:edu/stanford/nlp/sequences/CoNLLDocumentReaderAndWriter.class */
/**
 * Reads and writes documents in a CoNLL-style column format: one token per
 * line with whitespace-separated fields, and lines starting with
 * {@code -DOCSTART-} separating documents.
 *
 * <p>Each input line is turned into one {@link CoreLabel}; see
 * {@link #makeCoreLabel(String)} for how the number of fields on a line
 * determines which annotations are filled in. Lines with at most one field
 * (including blank lines) become tokens whose word is {@link #BOUNDARY}.
 *
 * <p>{@link #init(SeqClassifierFlags)} must be called before documents are
 * read or written, because the flags object supplies the background symbol,
 * the entity subclassification style and other options used here.
 */
public class CoNLLDocumentReaderAndWriter implements DocumentReaderAndWriter<CoreLabel> {
    private static final long serialVersionUID = 6281374154299530460L;

    /** Word given to tokens created from lines that have at most one field. */
    public static final String BOUNDARY = "*BOUNDARY*";

    // Not referenced anywhere in the class as it appears here; kept so the
    // class's declared members stay unchanged.
    private static final boolean TREAT_FILE_AS_ONE_DOCUMENT = false;

    /** Matches a line that starts (after optional whitespace) with {@code -DOCSTART-}. */
    private static final Pattern docPattern = Pattern.compile("^\\s*-DOCSTART-\\s");

    /** Matches a line that is empty or whitespace only. */
    private static final Pattern white = Pattern.compile("^\\s*$");

    /** Options supplied through {@link #init(SeqClassifierFlags)}; {@code null} until then. */
    private SeqClassifierFlags flags;

    /**
     * Iterator over the documents of one reader. The input is split into
     * per-document strings when the iterator is constructed; each call to
     * {@link #next()} converts one of those strings into a token list.
     */
    public class CoNLLIterator extends AbstractIterator<List<CoreLabel>> {
        /** The raw text of each document, in input order. */
        private final Iterator<String> stringIter;

        /**
         * @param reader source of the CoNLL-formatted text
         */
        public CoNLLIterator(Reader reader) {
            this.stringIter = CoNLLDocumentReaderAndWriter.splitIntoDocs(reader);
        }

        @Override // edu.stanford.nlp.util.AbstractIterator, java.util.Iterator
        public boolean hasNext() {
            return this.stringIter.hasNext();
        }

        @Override // edu.stanford.nlp.util.AbstractIterator, java.util.Iterator
        public List<CoreLabel> next() {
            return CoNLLDocumentReaderAndWriter.this.processDocument(this.stringIter.next());
        }
    }

    /**
     * Stores the flags object consulted by every reading and writing method.
     *
     * @param seqClassifierFlags the options to use
     */
    @Override // edu.stanford.nlp.sequences.DocumentReaderAndWriter
    public void init(SeqClassifierFlags seqClassifierFlags) {
        this.flags = seqClassifierFlags;
    }

    /**
     * Describes this object and the two flag values most relevant to it.
     * Safe to call before {@link #init(SeqClassifierFlags)}.
     */
    @Override
    public String toString() {
        if (this.flags == null) {
            // toString() is routinely called by loggers and debuggers; it must
            // not throw merely because init() has not happened yet.
            return "CoNLLDocumentReaderAndWriter[uninitialized]";
        }
        return "CoNLLDocumentReaderAndWriter[entitySubclassification: " + this.flags.entitySubclassification + ", intern: " + this.flags.intern + ']';
    }

    /**
     * Returns an iterator that yields one token list per document found in
     * {@code reader}.
     *
     * @param reader source of the CoNLL-formatted text
     * @return iterator over the documents
     */
    @Override // edu.stanford.nlp.objectbank.IteratorFromReaderFactory
    public Iterator<List<CoreLabel>> getIterator(Reader reader) {
        return new CoNLLIterator(reader);
    }

    /**
     * Reads every line from {@code reader} and groups the lines into
     * documents. A new document starts at each line matching
     * {@code -DOCSTART-}; that line is kept as the first line of the new
     * document. Every line is stored followed by a {@code '\n'}.
     *
     * @param reader source of the text
     * @return iterator over the document strings, in input order
     */
    public static Iterator<String> splitIntoDocs(Reader reader) {
        List<String> docs = new ArrayList<String>();
        StringBuilder current = new StringBuilder();
        Matcher docStart = docPattern.matcher("");
        for (String line : ObjectBank.getLineIterator(reader)) {
            // A DOCSTART line closes the document collected so far, unless
            // nothing has been collected yet (e.g. DOCSTART on the first line).
            if (docStart.reset(line).lookingAt() && current.length() > 0) {
                docs.add(current.toString());
                current.setLength(0);
            }
            current.append(line).append('\n');
        }
        if (current.length() > 0) {
            docs.add(current.toString());
        }
        return docs.iterator();
    }

    /**
     * Converts the text of one document into tokens, one per line, and then
     * passes the tokens to {@code IOBUtils.entitySubclassify} with the
     * subclassification style configured in the flags.
     *
     * <p>Blank lines are dropped when {@code flags.deleteBlankLines} is set;
     * otherwise they become {@link #BOUNDARY} tokens.
     *
     * @param str the document text, lines separated by {@code '\n'}
     * @return the tokens of the document
     */
    public List<CoreLabel> processDocument(String str) {
        List<CoreLabel> tokens = new ArrayList<CoreLabel>();
        for (String line : str.split("\n")) {
            boolean skip = this.flags.deleteBlankLines && white.matcher(line).matches();
            if (!skip) {
                tokens.add(makeCoreLabel(line));
            }
        }
        IOBUtils.entitySubclassify(tokens, CoreAnnotations.AnswerAnnotation.class, this.flags.backgroundSymbol, this.flags.entitySubclassification, this.flags.intern);
        return tokens;
    }

    /**
     * Builds the token for one input line. The line is split on runs of
     * whitespace and the field count selects the layout:
     * <ul>
     *   <li>0 or 1 fields: a {@link #BOUNDARY} token with the background symbol as answer</li>
     *   <li>2 fields: word, answer</li>
     *   <li>3 fields: word, tag, answer</li>
     *   <li>4 fields: word, tag, chunk, answer</li>
     *   <li>5 fields: word, lemma, tag, chunk, answer (the lemma replaces the
     *       word when {@code flags.useLemmaAsWord} is set)</li>
     * </ul>
     * The value annotation is set to the word and the gold answer annotation
     * to the answer.
     *
     * @param str one line of input
     * @return the token for that line
     * @throws RuntimeIOException if the line has more than five fields
     */
    private CoreLabel makeCoreLabel(String str) {
        CoreLabel token = new CoreLabel();
        String[] fields = str.split("\\s+");
        switch (fields.length) {
            case 0:
            case 1:
                token.setWord(BOUNDARY);
                token.set(CoreAnnotations.AnswerAnnotation.class, this.flags.backgroundSymbol);
                break;
            case 2:
                token.setWord(fields[0]);
                token.set(CoreAnnotations.AnswerAnnotation.class, fields[1]);
                break;
            case 3:
                token.setWord(fields[0]);
                token.setTag(fields[1]);
                token.set(CoreAnnotations.AnswerAnnotation.class, fields[2]);
                break;
            case 4:
                token.setWord(fields[0]);
                token.setTag(fields[1]);
                token.set(CoreAnnotations.ChunkAnnotation.class, fields[2]);
                token.set(CoreAnnotations.AnswerAnnotation.class, fields[3]);
                break;
            case 5:
                token.setWord(this.flags.useLemmaAsWord ? fields[1] : fields[0]);
                token.set(CoreAnnotations.LemmaAnnotation.class, fields[1]);
                token.setTag(fields[2]);
                token.set(CoreAnnotations.ChunkAnnotation.class, fields[3]);
                token.set(CoreAnnotations.AnswerAnnotation.class, fields[4]);
                break;
            default:
                throw new RuntimeIOException("Unexpected input (many fields): " + str);
        }
        token.set(CoreAnnotations.ValueAnnotation.class, token.word());
        token.set(CoreAnnotations.GoldAnswerAnnotation.class, token.get(CoreAnnotations.AnswerAnnotation.class));
        return token;
    }

    /**
     * Unless {@code flags.retainEntitySubclassification} is set, asks
     * {@code IOBUtils.entitySubclassify} to rewrite the answer annotations of
     * {@code list} using the {@code "iob1"} style.
     *
     * @param list the tokens to rewrite in place
     */
    private void deEndify(List<CoreLabel> list) {
        if (!this.flags.retainEntitySubclassification) {
            IOBUtils.entitySubclassify(list, CoreAnnotations.AnswerAnnotation.class, this.flags.backgroundSymbol, "iob1", this.flags.intern);
        }
    }

    /**
     * Writes one line per token: word, part-of-speech, chunk, gold answer and
     * answer, separated by tabs. {@link #BOUNDARY} tokens are written as an
     * empty line.
     *
     * <p>When the configured subclassification style is not {@code "iob1"},
     * the answers are first passed through {@link #deEndify(List)}, which may
     * modify {@code list}.
     *
     * @param list the tokens of one document
     * @param printWriter destination of the output
     */
    @Override // edu.stanford.nlp.sequences.DocumentReaderAndWriter
    public void printAnswers(List<CoreLabel> list, PrintWriter printWriter) {
        if (!"iob1".equalsIgnoreCase(this.flags.entitySubclassification)) {
            deEndify(list);
        }
        for (CoreLabel token : list) {
            // Compare by value, not identity: a boundary word that is equal to
            // BOUNDARY but is a different String instance must still be
            // recognised. Written constant-first so a null word is handled.
            if (BOUNDARY.equals(token.word())) {
                printWriter.println();
            } else {
                String gold = token.getString(CoreAnnotations.GoldAnswerAnnotation.class);
                String guess = token.get(CoreAnnotations.AnswerAnnotation.class);
                printWriter.println(token.word() + '\t' + token.getString(CoreAnnotations.PartOfSpeechAnnotation.class) + '\t' + token.getString(CoreAnnotations.ChunkAnnotation.class) + '\t' + gold + '\t' + guess);
            }
        }
    }

    /**
     * If {@code sb} holds any text, counts that text once in {@code counter}
     * and returns a fresh empty builder; otherwise returns {@code sb} itself.
     *
     * @param sb the text accumulated so far
     * @param counter the counter to increment
     * @return the builder to keep accumulating into
     */
    private static StringBuilder maybeIncrementCounter(StringBuilder sb, Counter<String> counter) {
        if (sb.length() == 0) {
            return sb;
        }
        counter.incrementCount(sb.toString());
        return new StringBuilder();
    }

    /**
     * Reads the input named by the first argument with default flags and
     * prints the number of documents, non-boundary tokens and entities,
     * followed by the MISC entities and how often each occurred.
     *
     * @param strArr command-line arguments; {@code strArr[0]} names the input
     * @throws IOException if the input cannot be opened or closed
     * @throws ClassNotFoundException declared for interface compatibility
     */
    public static void main(String[] strArr) throws IOException, ClassNotFoundException {
        if (strArr == null || strArr.length < 1) {
            System.err.println("Usage: java " + CoNLLDocumentReaderAndWriter.class.getName() + " <input>");
            return;
        }
        CoNLLDocumentReaderAndWriter readerAndWriter = new CoNLLDocumentReaderAndWriter();
        readerAndWriter.init(new SeqClassifierFlags());
        int numDocs = 0;
        int numTokens = 0;
        int numEntities = 0;
        String previousLabel = "";
        Counter<String> miscCounter = new ClassicCounter<String>();
        StringBuilder miscText = new StringBuilder();
        Reader reader = IOUtils.readerFromString(strArr[0]);
        try {
            Iterator<List<CoreLabel>> docs = readerAndWriter.getIterator(reader);
            while (docs.hasNext()) {
                numDocs++;
                for (CoreLabel token : docs.next()) {
                    String word = token.word();
                    if (word.equals(BOUNDARY)) {
                        continue;
                    }
                    // An answer is either a bare label ("O") or prefix-label ("B-MISC").
                    String[] parts = token.get(CoreAnnotations.AnswerAnnotation.class).split("-");
                    String label;
                    String prefix;
                    if (parts.length == 1) {
                        label = parts[0];
                        prefix = "";
                    } else {
                        label = parts[1];
                        prefix = parts[0];
                    }
                    numTokens++;
                    if (label.equals(SeqClassifierFlags.DEFAULT_BACKGROUND_SYMBOL)) {
                        miscText = maybeIncrementCounter(miscText, miscCounter);
                    } else {
                        // A new entity starts when the label changes or the
                        // token carries an explicit "B" prefix.
                        if (!label.equals(previousLabel) || prefix.equals("B")) {
                            numEntities++;
                            miscText = maybeIncrementCounter(miscText, miscCounter);
                        }
                        if (label.equals("MISC")) {
                            if (miscText.length() > 0) {
                                miscText.append(' ');
                            }
                            miscText.append(word);
                        }
                    }
                    previousLabel = label;
                }
            }
        } finally {
            reader.close();
        }
        // Count a MISC entity that was still being accumulated when the input
        // ended; without this the last such entity would be silently dropped.
        maybeIncrementCounter(miscText, miscCounter);
        System.out.println("File " + strArr[0] + " has " + numDocs + " documents, " + numTokens + " (non-blank line) tokens and " + numEntities + " entities.");
        System.out.printf("Here are the %.0f MISC items with counts:%n", miscCounter.totalCount());
        System.out.println(Counters.toVerticalString(miscCounter, "%.0f\t%s"));
    }
}
