package org.apache.mahout.text;

import com.google.common.io.Closeables;
import java.io.File;
import java.io.FileFilter;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.regex.Pattern;
import org.apache.commons.cli2.CommandLine;
import org.apache.commons.cli2.Group;
import org.apache.commons.cli2.Option;
import org.apache.commons.cli2.OptionException;
import org.apache.commons.cli2.builder.ArgumentBuilder;
import org.apache.commons.cli2.builder.DefaultOptionBuilder;
import org.apache.commons.cli2.builder.GroupBuilder;
import org.apache.commons.cli2.commandline.Parser;
import org.apache.commons.cli2.option.DefaultOption;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.lucene.analysis.wikipedia.WikipediaTokenizer;
import org.apache.lucene.benchmark.byTask.feeds.DocMaker;
import org.apache.lucene.index.IndexFileNames;
import org.apache.mahout.cf.taste.example.email.EmailUtility;
import org.apache.mahout.common.CommandLineUtil;
import org.apache.mahout.common.commandline.DefaultOptionCreator;
import org.apache.mahout.utils.email.MailOptions;
import org.apache.mahout.utils.email.MailProcessor;
import org.apache.mahout.utils.io.ChunkedWriter;
import org.apache.mahout.utils.regex.RegexMapper;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:org/apache/mahout/text/SequenceFilesFromMailArchives.class */
public final class SequenceFilesFromMailArchives {
    private static final Logger log = LoggerFactory.getLogger(SequenceFilesFromMailArchives.class);

    /* loaded from: input_file:org/apache/mahout/text/SequenceFilesFromMailArchives$PrefixAdditionFilter.class */
    /**
     * A {@link FileFilter} that is used for its side effects rather than for filtering:
     * every regular file offered to {@link #accept(File)} is handed to the mail processor,
     * and every directory is traversed recursively with a child processor whose key prefix
     * is extended by the directory name. {@code accept} always returns {@code false}, so
     * {@code File.listFiles(filter)} never retains any of the visited entries.
     */
    public class PrefixAdditionFilter implements FileFilter {
        private final MailProcessor processor;
        private final ChunkedWriter writer;
        /** Running total of messages parsed by this filter, including nested directories. */
        private long messageCount = 0;

        public PrefixAdditionFilter(MailProcessor mailProcessor, ChunkedWriter chunkedWriter) {
            this.processor = mailProcessor;
            this.writer = chunkedWriter;
        }

        /**
         * @return the number of messages parsed so far through this filter
         */
        public long getMessageCount() {
            return messageCount;
        }

        /**
         * Processes {@code file} and adds the number of parsed messages to the running total.
         *
         * @param file a regular file to parse, or a directory to descend into
         * @return always {@code false}
         * @throws IllegalStateException if parsing a regular file fails with an {@link IOException}
         */
        @Override // java.io.FileFilter
        public boolean accept(File file) {
            if (file.isDirectory()) {
                long parsedInDirectory = parseDirectory(file);
                this.messageCount += parsedInDirectory;
                return false;
            }
            try {
                this.messageCount += this.processor.parseMboxLineByLine(file);
            } catch (IOException e) {
                throw new IllegalStateException("Error processing " + file, e);
            }
            return false;
        }

        /** Walks {@code directory} with a nested filter and returns the messages it parsed. */
        private long parseDirectory(File directory) {
            log.info("At {}", directory.getAbsolutePath());
            // Keys of messages below this directory carry the directory name in their prefix.
            String nestedPrefix = this.processor.getPrefix() + File.separator + directory.getName();
            MailProcessor nestedProcessor = new MailProcessor(this.processor.getOptions(), nestedPrefix, this.writer);
            PrefixAdditionFilter nestedFilter = new PrefixAdditionFilter(nestedProcessor, this.writer);
            // The returned array is irrelevant; the call is made only to visit every entry.
            directory.listFiles(nestedFilter);
            long parsed = nestedFilter.getMessageCount();
            log.info("Parsed {} messages from directory {}", Long.valueOf(parsed), directory.getAbsolutePath());
            return parsed;
        }
    }

    /**
     * Parses the mail archive(s) referenced by {@code mailOptions} and writes them through a
     * {@link ChunkedWriter} rooted at the configured output directory.
     *
     * <p>If the input is a directory it is traversed with a {@link PrefixAdditionFilter};
     * otherwise the input is parsed as a single file and the elapsed time is logged.
     *
     * <p>When parsing succeeds, a failure while closing the writer is propagated so that
     * incomplete output does not go unnoticed. When parsing has already failed, the writer is
     * closed quietly so that the original exception is the one the caller sees.
     *
     * @param mailOptions input location, output directory, chunk size, prefix and parse options
     * @throws IOException if parsing fails or the writer cannot be closed after a successful parse
     */
    public void createSequenceFiles(MailOptions mailOptions) throws IOException {
        ChunkedWriter chunkedWriter = new ChunkedWriter(new Configuration(), mailOptions.getChunkSize(), new Path(mailOptions.getOutputDir()));
        boolean completed = false;
        try {
            // Constructed inside the try so the writer is released even if this throws.
            MailProcessor mailProcessor = new MailProcessor(mailOptions, mailOptions.getPrefix(), chunkedWriter);
            File input = mailOptions.getInput();
            if (input.isDirectory()) {
                PrefixAdditionFilter prefixAdditionFilter = new PrefixAdditionFilter(mailProcessor, chunkedWriter);
                // The filter does the parsing as a side effect; the listing itself is not needed.
                input.listFiles(prefixAdditionFilter);
                log.info("Parsed {} messages from {}", Long.valueOf(prefixAdditionFilter.getMessageCount()), input.getAbsolutePath());
            } else {
                long start = System.currentTimeMillis();
                long parsed = mailProcessor.parseMboxLineByLine(input);
                long elapsed = System.currentTimeMillis() - start;
                log.info("Parsed {} messages from {} in time: {}", new Object[]{Long.valueOf(parsed), input.getAbsolutePath(), Long.valueOf(elapsed)});
            }
            completed = true;
        } finally {
            if (completed) {
                // Surface close/flush failures: swallowing them could hide truncated output.
                chunkedWriter.close();
            } else {
                // An exception is already propagating; do not mask it with a close failure.
                Closeables.closeQuietly(chunkedWriter);
            }
        }
    }

    /**
     * Command-line entry point: builds the option group, parses {@code strArr}, fills a
     * {@link MailOptions} instance from the parsed values and runs
     * {@link #createSequenceFiles(MailOptions)}.
     *
     * <p>If help is requested, or the arguments cannot be parsed, usage is printed and the
     * method returns without converting anything.
     *
     * @param strArr command-line arguments
     * @throws Exception if option values are invalid (e.g. a non-numeric chunk size or an
     *                   unknown charset) or the conversion itself fails
     */
    public static void main(String[] strArr) throws Exception {
        DefaultOptionBuilder optionBuilder = new DefaultOptionBuilder();
        ArgumentBuilder argumentBuilder = new ArgumentBuilder();
        GroupBuilder groupBuilder = new GroupBuilder();

        // NOTE(review): several short names below come from constants of unrelated classes
        // (WikipediaTokenizer, IndexFileNames, DocMaker, RegexMapper, EmailUtility). This looks
        // like a decompiler substituting constants for string literals - confirm the values.
        Option inputOpt = DefaultOptionCreator.inputOption().create();
        Option outputOpt = DefaultOptionCreator.outputOption().create();
        Option chunkSizeOpt = optionBuilder.withLongName("chunkSize").withArgument(argumentBuilder.withName("chunkSize").withMinimum(1).withMaximum(1).create()).withDescription("The chunkSize in MegaBytes. Defaults to 64").withShortName("chunk").create();
        Option keyPrefixOpt = optionBuilder.withLongName("keyPrefix").withArgument(argumentBuilder.withName("keyPrefix").withMinimum(1).withMaximum(1).create()).withDescription("The prefix to be prepended to the key").withShortName("prefix").create();
        Option charsetOpt = optionBuilder.withLongName("charset").withRequired(true).withArgument(argumentBuilder.withName("charset").withMinimum(1).withMaximum(1).create()).withDescription("The name of the character encoding of the input files").withShortName(WikipediaTokenizer.CATEGORY).create();
        Option subjectOpt = optionBuilder.withLongName("subject").withRequired(false).withDescription("Include the Mail subject as part of the text.  Default is false").withShortName(IndexFileNames.SEPARATE_NORMS_EXTENSION).create();
        Option toOpt = optionBuilder.withLongName("to").withRequired(false).withDescription("Include the to field in the text.  Default is false").withShortName("to").create();
        Option fromOpt = optionBuilder.withLongName("from").withRequired(false).withDescription("Include the from field in the text.  Default is false").withShortName("from").create();
        Option refsOpt = optionBuilder.withLongName("references").withRequired(false).withDescription("Include the references field in the text.  Default is false").withShortName("refs").create();
        Option bodyOpt = optionBuilder.withLongName(DocMaker.BODY_FIELD).withRequired(false).withDescription("Include the body in the output.  Default is false").withShortName(WikipediaTokenizer.BOLD).create();
        // NOTE(review): stripQuoted and quotedRegex both register the short name "q". Confirm
        // which option the parser resolves "-q" to and consider giving one of them a distinct
        // short name; left unchanged here to avoid altering the existing command line.
        Option stripQuotedOpt = optionBuilder.withLongName("stripQuoted").withRequired(false).withDescription("Strip (remove) quoted email text in the body.  Default is false").withShortName("q").create();
        Option quotedRegexOpt = optionBuilder.withLongName("quotedRegex").withRequired(false).withArgument(argumentBuilder.withName(RegexMapper.REGEX).withMinimum(1).withMaximum(1).create()).withDescription("Specify the regex that identifies quoted text.  Default is to look for > or | at the beginning of the line.").withShortName("q").create();
        Option separatorOpt = optionBuilder.withLongName(EmailUtility.SEPARATOR).withRequired(false).withArgument(argumentBuilder.withName(EmailUtility.SEPARATOR).withMinimum(1).withMaximum(1).create()).withDescription("The separator to use between metadata items (to, from, etc.).  Default is \\n").withShortName("sep").create();
        Option bodySeparatorOpt = optionBuilder.withLongName("bodySeparator").withRequired(false).withArgument(argumentBuilder.withName("bodySeparator").withMinimum(1).withMaximum(1).create()).withDescription("The separator to use between lines in the body.  Default is \\n.  Useful to change if you wish to have the message be on one line").withShortName("bodySep").create();
        Option helpOpt = DefaultOptionCreator.helpOption();

        // Registration order is kept exactly as before.
        Group group = groupBuilder.withName("Options")
                .withOption(keyPrefixOpt)
                .withOption(chunkSizeOpt)
                .withOption(charsetOpt)
                .withOption(outputOpt)
                .withOption(helpOpt)
                .withOption(inputOpt)
                .withOption(subjectOpt)
                .withOption(toOpt)
                .withOption(fromOpt)
                .withOption(bodyOpt)
                .withOption(stripQuotedOpt)
                .withOption(refsOpt)
                .withOption(bodySeparatorOpt)
                .withOption(quotedRegexOpt)
                .withOption(separatorOpt)
                .create();
        try {
            Parser parser = new Parser();
            parser.setGroup(group);
            parser.setHelpOption(helpOpt);
            CommandLine cmdLine = parser.parse(strArr);
            if (cmdLine.hasOption(helpOpt)) {
                CommandLineUtil.printHelp(group);
                return;
            }

            File input = new File((String) cmdLine.getValue(inputOpt));
            String outputDir = (String) cmdLine.getValue(outputOpt);
            int chunkSize = 64;
            if (cmdLine.hasOption(chunkSizeOpt)) {
                chunkSize = Integer.parseInt((String) cmdLine.getValue(chunkSizeOpt));
            }
            String prefix = cmdLine.hasOption(keyPrefixOpt) ? (String) cmdLine.getValue(keyPrefixOpt) : "";
            Charset charset = Charset.forName((String) cmdLine.getValue(charsetOpt));

            SequenceFilesFromMailArchives converter = new SequenceFilesFromMailArchives();
            MailOptions mailOptions = new MailOptions();
            mailOptions.setInput(input);
            mailOptions.setOutputDir(outputDir);
            mailOptions.setPrefix(prefix);
            mailOptions.setChunkSize(chunkSize);
            mailOptions.setCharset(charset);

            // Each selected header is mapped to its position in the pattern list. The position
            // is the list size at the time of insertion, so no separate counter is needed.
            ArrayList<Pattern> patterns = new ArrayList<Pattern>(5);
            HashMap<String, Integer> patternOrder = new HashMap<String, Integer>();
            if (cmdLine.hasOption(fromOpt)) {
                patternOrder.put(MailOptions.FROM, Integer.valueOf(patterns.size()));
                patterns.add(MailProcessor.FROM_PREFIX);
            }
            if (cmdLine.hasOption(toOpt)) {
                patternOrder.put(MailOptions.TO, Integer.valueOf(patterns.size()));
                patterns.add(MailProcessor.TO_PREFIX);
            }
            if (cmdLine.hasOption(refsOpt)) {
                patternOrder.put(MailOptions.REFS, Integer.valueOf(patterns.size()));
                patterns.add(MailProcessor.REFS_PREFIX);
            }
            if (cmdLine.hasOption(subjectOpt)) {
                patternOrder.put(MailOptions.SUBJECT, Integer.valueOf(patterns.size()));
                patterns.add(MailProcessor.SUBJECT_PREFIX);
            }

            mailOptions.setStripQuotedText(cmdLine.hasOption(stripQuotedOpt));
            mailOptions.setPatternsToMatch(patterns.toArray(new Pattern[patterns.size()]));
            mailOptions.setPatternOrder(patternOrder);
            mailOptions.setIncludeBody(cmdLine.hasOption(bodyOpt));
            // Default metadata separator; overridden below when the option is supplied.
            mailOptions.setSeparator("\n");
            if (cmdLine.hasOption(separatorOpt)) {
                mailOptions.setSeparator(cmdLine.getValue(separatorOpt).toString());
            }
            if (cmdLine.hasOption(bodySeparatorOpt)) {
                mailOptions.setBodySeparator(cmdLine.getValue(bodySeparatorOpt).toString());
            }
            if (cmdLine.hasOption(quotedRegexOpt)) {
                mailOptions.setQuotedTextPattern(Pattern.compile(cmdLine.getValue(quotedRegexOpt).toString()));
            }

            long start = System.currentTimeMillis();
            converter.createSequenceFiles(mailOptions);
            log.info("Conversion took {}ms", Long.valueOf(System.currentTimeMillis() - start));
        } catch (OptionException e) {
            // Invalid or missing arguments: report the problem and show usage.
            log.error("Exception", (Throwable) e);
            CommandLineUtil.printHelp(group);
        }
    }
}
