package org.apache.tika.eval.app.tools;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.LinkOption;
import java.nio.file.OpenOption;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.attribute.FileAttribute;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Objects;
import java.util.Set;
import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
import org.apache.commons.io.FileUtils;
import org.apache.commons.lang3.StringUtils;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.PriorityQueue;
import org.apache.tika.eval.core.tokens.AnalyzerManager;
import org.apache.tika.eval.core.tokens.URLEmailNormalizingFilterFactory;
import org.apache.tika.utils.ProcessUtils;

/* loaded from: input_file:org/apache/tika/eval/app/tools/TopCommonTokenCounter.class */
public class TopCommonTokenCounter {
    /** Name of the single Lucene field that every input line is indexed into. */
    private static final String FIELD = "f";

    /** Tokens written verbatim (without counts) at the top of every output file, before the counted tokens. */
    static Set<String> INCLUDE_LIST = new HashSet<>(Arrays.asList(URLEmailNormalizingFilterFactory.URL, URLEmailNormalizingFilterFactory.EMAIL));

    /**
     * Tokens that are never added to the top-N queue, whatever their document frequency.
     * NOTE(review): these look like HTML tag/attribute names, presumably residue of failed markup
     * stripping -- confirm the rationale with the original author.
     */
    static Set<String> SKIP_LIST = new HashSet<>(Arrays.asList("span", "table", "href", "head", "title", "body", "html", "tagname", "lang", "style", "script", "strong", "blockquote", "form", "iframe", "section", "colspan", "rowspan"));

    /** License header written as '#'-prefixed comment lines at the top of every output file. */
    private static final String LICENSE =
            "# Licensed to the Apache Software Foundation (ASF) under one or more\n"
            + "# contributor license agreements.  See the NOTICE file distributed with\n"
            + "# this work for additional information regarding copyright ownership.\n"
            + "# The ASF licenses this file to You under the Apache License, Version 2.0\n"
            + "# (the \"License\"); you may not use this file except in compliance with\n"
            + "# the License.  You may obtain a copy of the License at\n"
            + "#\n"
            + "#     http://www.apache.org/licenses/LICENSE-2.0\n"
            + "#\n"
            + "# Unless required by applicable law or agreed to in writing, software\n"
            + "# distributed under the License is distributed on an \"AS IS\" BASIS,\n"
            + "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\n"
            + "# See the License for the specific language governing permissions and\n"
            + "# limitations under the License.\n"
            + "#\n";

    /** Maximum number of tokens kept in the priority queue and therefore written to the output file. */
    private static final int TOP_N = 30000;

    /** Minimum document frequency a token needs to be considered; a value of -1 or lower disables the filter. */
    private static final int MIN_DOC_FREQ = 10;

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/tika/eval/app/tools/TopCommonTokenCounter$AbstractTokenTFDFPriorityQueue.class */
    /**
     * Bounded Lucene priority queue of {@link TokenDFTF} entries. Subclasses supply the ordering
     * through {@code lessThan}; this base class only adds a way to drain the queue into an array.
     */
    public static abstract class AbstractTokenTFDFPriorityQueue extends PriorityQueue<TokenDFTF> {

        /**
         * @param maxSize capacity handed to the Lucene {@code PriorityQueue} constructor
         */
        AbstractTokenTFDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        /**
         * Drains the queue into an array. Entries are popped one by one and stored from the last
         * index backwards, so the first element popped ends up in the last slot.
         *
         * @return array sized to the number of queued entries at the time of the call; the queue
         *         is emptied as a side effect
         */
        public TokenDFTF[] getArray() {
            final TokenDFTF[] drained = new TokenDFTF[size()];
            int slot = drained.length;
            for (TokenDFTF entry = pop(); entry != null && slot > 0; entry = pop()) {
                drained[--slot] = entry;
            }
            return drained;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/tika/eval/app/tools/TopCommonTokenCounter$TokenDFPriorityQueue.class */
    /**
     * Priority queue ordered primarily by document frequency.
     *
     * <p>{@code getArray()} is inherited from {@link AbstractTokenTFDFPriorityQueue}; the copy that
     * used to be declared here was identical to the inherited implementation and has been removed.
     */
    public static class TokenDFPriorityQueue extends AbstractTokenTFDFPriorityQueue {

        /**
         * @param maxSize capacity handed to the underlying Lucene {@code PriorityQueue}
         */
        TokenDFPriorityQueue(int maxSize) {
            super(maxSize);
        }

        /**
         * Orders entries by ascending document frequency. When the frequencies are equal, the
         * entry whose token sorts later (per {@link String#compareTo}) is treated as the smaller
         * one.
         *
         * @param a first entry
         * @param b second entry
         * @return {@code true} if {@code a} ranks below {@code b}
         */
        @Override // org.apache.lucene.util.PriorityQueue
        public boolean lessThan(TokenDFTF a, TokenDFTF b) {
            if (a.df != b.df) {
                return a.df < b.df;
            }
            // Tie on document frequency: reverse lexical order of the tokens decides.
            return b.token.compareTo(a.token) < 0;
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:org/apache/tika/eval/app/tools/TopCommonTokenCounter$TokenDFTF.class */
    /**
     * Immutable value holder pairing a token with its document frequency (df) and its total term
     * frequency (tf).
     */
    public static class TokenDFTF {
        final String token;
        final int df;
        final long tf;

        /**
         * @param token the token text
         * @param df    document frequency of the token
         * @param tf    total term frequency of the token
         */
        public TokenDFTF(String token, int df, long tf) {
            this.token = token;
            this.df = df;
            this.tf = tf;
        }

        /** @return the total term frequency */
        public long getTF() {
            return tf;
        }

        /** @return the document frequency */
        public int getDF() {
            return df;
        }

        /** @return the token text */
        public String getToken() {
            return token;
        }

        /** Two instances are equal when they have the same class, token, df and tf. */
        @Override
        public boolean equals(Object other) {
            if (other == this) {
                return true;
            }
            if (other == null || other.getClass() != getClass()) {
                return false;
            }
            TokenDFTF that = (TokenDFTF) other;
            return df == that.df && tf == that.tf && Objects.equals(token, that.token);
        }

        /** Combines token, df and tf with the usual 31-multiplier scheme. */
        @Override
        public int hashCode() {
            int result = Objects.hashCode(token);
            result = 31 * result + df;
            result = 31 * result + Long.hashCode(tf);
            return result;
        }

        @Override
        public String toString() {
            return new StringBuilder("TokenDFTF{token='")
                    .append(token)
                    .append("', df=")
                    .append(df)
                    .append(", tf=")
                    .append(tf)
                    .append('}')
                    .toString();
        }
    }

    /**
     * Command-line entry point.
     *
     * <p>The first argument is the output file; every further argument is an input file whose
     * path is passed through {@code ProcessUtils.unescapeCommandLine} first. If the output file
     * already exists, a message is printed to stderr and nothing is processed.
     *
     * @param args {@code <output_file> [<input_file> ...]}
     * @throws IllegalArgumentException if no arguments are given
     * @throws Exception                if indexing or writing the output fails
     */
    public static void main(String[] args) throws Exception {
        if (args == null || args.length == 0) {
            // Fail with a usage hint instead of an ArrayIndexOutOfBoundsException on args[0].
            throw new IllegalArgumentException(
                    "Usage: TopCommonTokenCounter <output_file> [<input_file> ...]");
        }
        Path commonTokensFile = Paths.get(args[0]);
        List<Path> inputFiles = new ArrayList<>();
        for (int i = 1; i < args.length; i++) {
            inputFiles.add(Paths.get(ProcessUtils.unescapeCommandLine(args[i])));
        }
        if (Files.exists(commonTokensFile)) {
            System.err.println(commonTokensFile.getFileName().toString() + " exists. I'm skipping this.");
            return;
        }
        new TopCommonTokenCounter().execute(commonTokensFile, inputFiles);
    }

    /**
     * Writes the common-tokens file: the license header, four '#'-prefixed corpus statistics, a
     * column header, every entry of {@code INCLUDE_LIST} on its own line, and then one
     * tab-separated {@code token / df / tf} row per entry drained from the queue.
     *
     * <p>If {@code path} is already a regular file, a message is printed to stderr and nothing is
     * written. Missing parent directories are created. All lines are terminated with {@code "\n"}.
     *
     * @param path              output file
     * @param docCount          value written as {@code #DOC_COUNT}
     * @param sumDocFreqs       value written as {@code #SUM_DOC_FREQS}
     * @param sumTotalTermFreqs value written as {@code #SUM_TERM_FREQS}
     * @param uniqueTerms       value written as {@code #UNIQUE_TERMS}
     * @param queue             queue holding the tokens to write; it is emptied by this call
     * @throws IOException if the directories or the file cannot be created or written
     */
    private static void writeTopN(Path path, long docCount, long sumDocFreqs, long sumTotalTermFreqs, long uniqueTerms, AbstractTokenTFDFPriorityQueue queue) throws IOException {
        if (Files.isRegularFile(path)) {
            System.err.println("File " + path.getFileName() + " already exists. Skipping.");
            return;
        }
        // getParent() is null for a bare file name such as "tokens.txt"; there is nothing to create then.
        Path parent = path.getParent();
        if (parent != null) {
            Files.createDirectories(parent);
        }
        try (BufferedWriter writer = Files.newBufferedWriter(path, StandardCharsets.UTF_8)) {
            StringBuilder rowBuffer = new StringBuilder();
            writer.write(LICENSE);
            writer.write("#DOC_COUNT\t" + docCount + "\n");
            writer.write("#SUM_DOC_FREQS\t" + sumDocFreqs + "\n");
            writer.write("#SUM_TERM_FREQS\t" + sumTotalTermFreqs + "\n");
            writer.write("#UNIQUE_TERMS\t" + uniqueTerms + "\n");
            writer.write("#TOKEN\tDOCFREQ\tTERMFREQ\n");
            for (String included : INCLUDE_LIST) {
                // Same "\n" terminator as every other line, rather than the platform separator.
                writer.write(included);
                writer.write("\n");
            }
            for (TokenDFTF entry : queue.getArray()) {
                writer.write(getRow(rowBuffer, entry));
                writer.write("\n");
            }
        }
    }

    /**
     * Formats one output row as {@code token<TAB>df<TAB>tf}, with the token passed through
     * {@code clean} first.
     *
     * @param sb        scratch buffer; its previous content is discarded
     * @param tokenDFTF entry to format
     * @return the formatted row without a line terminator
     */
    private static String getRow(StringBuilder sb, TokenDFTF tokenDFTF) {
        sb.setLength(0);
        return sb.append(clean(tokenDFTF.token))
                .append("\t").append(tokenDFTF.df)
                .append("\t").append(tokenDFTF.tf)
                .toString();
    }

    /**
     * Collapses every run of whitespace into a single space and trims both ends.
     *
     * @param str text to normalize; may be {@code null}
     * @return the normalized text, or the empty string for {@code null}
     */
    private static String clean(String str) {
        if (str == null) {
            return "";
        }
        String collapsed = str.replaceAll("\\s+", " ");
        return collapsed.trim();
    }

    /** Pending documents are flushed to the index once their combined line length exceeds this many chars. */
    private static final int MAX_BATCH_CHARS = 1000000;

    /** A progress message is printed each time this many lines of one input file have been read. */
    private static final int PROGRESS_INTERVAL = 100000;

    /**
     * Indexes every line of the input files as its own document in a temporary Lucene index,
     * enumerates the terms of that index, keeps the {@code TOP_N} terms with the highest document
     * frequency (skipping {@code SKIP_LIST} and terms below {@code MIN_DOC_FREQ}), and writes
     * them with corpus-level statistics to {@code commonTokensFile}.
     *
     * <p>The temporary index directory is deleted before returning, on failure as well.
     *
     * @param commonTokensFile output file
     * @param inputFiles       text files to index, optionally gzip-compressed ({@code .gz})
     * @throws IOException if no terms were indexed, or on any I/O failure
     * @throws Exception   for any other failure reported by the analyzer or index
     */
    private void execute(Path commonTokensFile, List<Path> inputFiles) throws Exception {
        Path luceneDir = Files.createTempDirectory("tika-eval-lucene-");
        TokenDFPriorityQueue queue = new TokenDFPriorityQueue(TOP_N);
        long docCount;
        long sumDocFreqs;
        long sumTotalTermFreqs;
        // Starts at 0 so the reported value equals the number of terms enumerated.
        long uniqueTerms = 0;
        try {
            // try-with-resources closes the directory, writer and reader on failure too.
            try (FSDirectory directory = FSDirectory.open(luceneDir)) {
                IndexWriterConfig config =
                        new IndexWriterConfig(AnalyzerManager.newInstance(-1).getCommonTokensAnalyzer());
                try (IndexWriter writer = new IndexWriter(directory, config)) {
                    indexInputFiles(writer, commonTokensFile, inputFiles);
                    writer.commit();
                }
                try (DirectoryReader reader = DirectoryReader.open(directory)) {
                    LeafReader wrapped = SlowCompositeReaderWrapper.wrap(reader);
                    Terms terms = wrapped.terms(FIELD);
                    if (terms == null) {
                        // terms() returns null when nothing was indexed for the field.
                        throw new IOException("No terms were indexed from " + inputFiles
                                + "; nothing to write to " + commonTokensFile.toAbsolutePath());
                    }
                    docCount = wrapped.getDocCount(FIELD);
                    sumDocFreqs = wrapped.getSumDocFreq(FIELD);
                    sumTotalTermFreqs = wrapped.getSumTotalTermFreq(FIELD);

                    TermsEnum termsEnum = terms.iterator();
                    for (BytesRef term = termsEnum.next(); term != null; term = termsEnum.next()) {
                        uniqueTerms++;
                        int docFreq = termsEnum.docFreq();
                        if (MIN_DOC_FREQ > -1 && docFreq < MIN_DOC_FREQ) {
                            continue;
                        }
                        // Once the queue is full, only decode terms that can still displace its smallest entry.
                        if (queue.top() == null || queue.size() < TOP_N || docFreq >= queue.top().df) {
                            String token = term.utf8ToString();
                            if (!SKIP_LIST.contains(token)) {
                                queue.insertWithOverflow(new TokenDFTF(token, docFreq, termsEnum.totalTermFreq()));
                            }
                        }
                    }
                }
            }
            writeTopN(commonTokensFile, docCount, sumDocFreqs, sumTotalTermFreqs, uniqueTerms, queue);
        } finally {
            FileUtils.deleteDirectory(luceneDir.toFile());
        }
    }

    /**
     * Adds every line of every input file to the index as one document in field {@code FIELD}.
     * Documents are handed to the writer in batches; the batch size counter carries over from one
     * file to the next.
     *
     * <p>For files whose name contains {@code "-sentences.txt"}, everything up to and including
     * the first tab of each line is dropped before indexing.
     *
     * @param writer           open index writer; not committed or closed here
     * @param commonTokensFile output file, used only in progress messages
     * @param inputFiles       files to read
     * @throws IOException if a file cannot be read or the index cannot be written
     */
    private void indexInputFiles(IndexWriter writer, Path commonTokensFile, List<Path> inputFiles) throws IOException {
        List<Document> batch = new ArrayList<>();
        int batchChars = 0;
        for (Path inputFile : inputFiles) {
            boolean stripLeadingColumn = inputFile.getFileName().toString().contains("-sentences.txt");
            int linesRead = 0;
            try (BufferedReader reader = getReader(inputFile)) {
                String line = reader.readLine();
                while (line != null) {
                    if (stripLeadingColumn) {
                        int tab = line.indexOf("\t");
                        if (tab > -1) {
                            line = line.substring(tab + 1);
                        }
                    }
                    batchChars += line.length();
                    Document document = new Document();
                    document.add(new TextField(FIELD, line, Field.Store.NO));
                    batch.add(document);
                    if (batchChars > MAX_BATCH_CHARS) {
                        writer.addDocuments(batch);
                        batch.clear();
                        batchChars = 0;
                    }
                    line = reader.readLine();
                    linesRead++;
                    if (linesRead % PROGRESS_INTERVAL == 0) {
                        System.out.println("processed " + linesRead + " for " + inputFile.getFileName() + " :: " + commonTokensFile.toAbsolutePath());
                    }
                }
            }
        }
        if (!batch.isEmpty()) {
            writer.addDocuments(batch);
        }
    }

    /**
     * Opens {@code path} as UTF-8 text, decompressing on the fly when the path ends with
     * {@code ".gz"}.
     *
     * @param path file to read
     * @return a buffered reader over the (possibly decompressed) content; the caller must close it
     * @throws IOException if the file cannot be opened or the gzip stream cannot be initialized
     */
    private BufferedReader getReader(Path path) throws IOException {
        InputStream stream = Files.newInputStream(path);
        try {
            if (path.toString().endsWith(".gz")) {
                stream = new GzipCompressorInputStream(stream);
            }
            return new BufferedReader(new InputStreamReader(stream, StandardCharsets.UTF_8));
        } catch (IOException | RuntimeException e) {
            // Close the already-opened file stream if wrapping it fails, so the handle is not leaked.
            try {
                stream.close();
            } catch (IOException closeFailure) {
                e.addSuppressed(closeFailure);
            }
            throw e;
        }
    }
}
