package org.apache.lucene.benchmark.utils;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileFilter;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.configuration.tree.DefaultExpressionEngine;

/* loaded from: input_file:org/apache/lucene/benchmark/utils/ExtractReuters.class */
/**
 * Splits Reuters SGML collection files ({@code *.sgm}) into one plain-text file per article.
 *
 * <p>For every article (terminated by a line containing {@code </REUTERS}) the contents of the
 * {@code <TITLE>}, {@code <DATE>} and {@code <BODY>} elements are extracted in the order they
 * occur, each followed by two line separators, and written to
 * {@code <outputDir>/<sgm file name>-<n>.txt}, where {@code n} counts articles per input file
 * starting at 0. Files are read and written with {@link FileReader}/{@link FileWriter}, i.e. the
 * default charset.
 */
public class ExtractReuters {
    /** Directory scanned for {@code *.sgm} input files. */
    private final File reutersDir;

    /** Directory that receives the extracted {@code .txt} files. */
    private final File outputDir;

    /** Matches one TITLE, DATE or BODY element; exactly one of the three groups is non-null per match. */
    Pattern EXTRACTION_PATTERN = Pattern.compile("<TITLE>(.*?)</TITLE>|<DATE>(.*?)</DATE>|<BODY>(.*?)</BODY>");

    private static final String LINE_SEPARATOR = System.getProperty("line.separator");

    /** Literal characters, index-aligned with {@link #META_CHARS_SERIALIZATIONS}. */
    private static final String[] META_CHARS = {"&", "<", ">", "\"", "'"};

    /** Entity forms of {@link #META_CHARS}; the ampersand entity must stay at index 0 (see {@link #unescape}). */
    private static final String[] META_CHARS_SERIALIZATIONS = {"&amp;", "&lt;", "&gt;", "&quot;", "&apos;"};

    /**
     * Creates an extractor and deletes the files currently in the output directory.
     *
     * @param file directory containing the Reuters {@code *.sgm} files
     * @param file2 output directory; its direct children are deleted on a best-effort basis
     */
    public ExtractReuters(File file, File file2) {
        this.reutersDir = file;
        this.outputDir = file2;
        System.out.println("Deleting all files in " + file2);
        // listFiles() returns null when the path is missing or is not a directory.
        File[] staleFiles = file2.listFiles();
        if (staleFiles != null) {
            for (File stale : staleFiles) {
                stale.delete();
            }
        }
    }

    /**
     * Extracts every {@code *.sgm} file found directly in the Reuters directory. Prints a message
     * to stderr and returns if there are none.
     */
    public void extract() {
        File[] sgmFiles = this.reutersDir.listFiles(new FileFilter() {
            @Override
            public boolean accept(File file) {
                return file.getName().endsWith(".sgm");
            }
        });
        if (sgmFiles == null || sgmFiles.length <= 0) {
            System.err.println("No .sgm files in " + this.reutersDir);
            return;
        }
        for (File sgmFile : sgmFiles) {
            extractFile(sgmFile);
        }
    }

    /**
     * Splits a single SGML file into per-article text files in the output directory.
     *
     * @param file the {@code .sgm} file to process
     * @throws RuntimeException wrapping any {@link IOException} raised while reading or writing
     */
    protected void extractFile(File file) {
        // try-with-resources closes the reader on the error path as well.
        try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
            StringBuilder article = new StringBuilder(1024);
            int articleNumber = 0;
            String line;
            while ((line = reader.readLine()) != null) {
                if (line.indexOf("</REUTERS") == -1) {
                    // Lines are joined with a space so elements spanning lines match the pattern.
                    article.append(line).append(' ');
                    continue;
                }
                // End of article: the terminating line itself is not part of the extracted text.
                String text = unescape(extractFields(article));
                File target = new File(this.outputDir, file.getName() + "-" + articleNumber + ".txt");
                articleNumber++;
                writeDocument(target, text);
                article.setLength(0);
            }
        } catch (IOException e) {
            throw new RuntimeException(e);
        }
    }

    /** Concatenates all TITLE/DATE/BODY contents of one article, each followed by two line separators. */
    private String extractFields(CharSequence article) {
        StringBuilder fields = new StringBuilder(1024);
        Matcher matcher = this.EXTRACTION_PATTERN.matcher(article);
        while (matcher.find()) {
            for (int group = 1; group <= matcher.groupCount(); group++) {
                String value = matcher.group(group);
                if (value != null) {
                    fields.append(value);
                }
            }
            fields.append(LINE_SEPARATOR).append(LINE_SEPARATOR);
        }
        return fields.toString();
    }

    /** Replaces the five entity forms with their literal characters, unescaping exactly one level. */
    private static String unescape(String text) {
        String result = text;
        // Walk the table backwards so "&amp;" (index 0) is handled last; doing it first would
        // turn "&amp;lt;" into "&lt;" and then wrongly into "<".
        for (int i = META_CHARS_SERIALIZATIONS.length - 1; i >= 0; i--) {
            // Literal replacement: none of these strings is meant as a regular expression.
            result = result.replace(META_CHARS_SERIALIZATIONS[i], META_CHARS[i]);
        }
        return result;
    }

    /** Writes {@code text} to {@code target}, closing the writer even when the write fails. */
    private static void writeDocument(File target, String text) throws IOException {
        try (FileWriter writer = new FileWriter(target)) {
            writer.write(text);
        }
    }

    /**
     * Command-line entry point. Extracts into {@code <Output Path>-tmp} and then renames that
     * directory to {@code <Output Path>}.
     *
     * @param strArr {@code <Path to Reuters SGM files> <Output Path>}
     */
    public static void main(String[] strArr) {
        if (strArr.length != 2) {
            usage("Wrong number of arguments (" + strArr.length + ")");
            return;
        }
        File reutersDir = new File(strArr[0]);
        if (!reutersDir.exists()) {
            usage("Cannot find Path to Reuters SGM files (" + reutersDir + ")");
            return;
        }
        File finalDir = new File(strArr[1]);
        File tempDir = new File(finalDir.getAbsolutePath() + "-tmp");
        tempDir.mkdirs();
        new ExtractReuters(reutersDir, tempDir).extract();
        // renameTo reports failure only through its return value; surface it instead of dropping it.
        if (!tempDir.renameTo(finalDir)) {
            System.err.println("Could not rename " + tempDir + " to " + finalDir);
        }
    }

    /** Prints {@code str} together with the command-line synopsis to stderr. */
    private static void usage(String str) {
        System.err.println("Usage: " + str + " :: java -cp <...> org.apache.lucene.benchmark.utils.ExtractReuters <Path to Reuters SGM files> <Output Path>");
    }
}
