package org.apache.nutch.scoring.webgraph;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hdfs.protocol.FSConstants;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.WritableUtils;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.mapred.TextOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.Generator;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.URLUtil;
import org.apache.tools.ant.types.selectors.DepthSelector;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.testng.reporters.XMLReporterConfig;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/NodeDumper.class */
public class NodeDumper extends Configured implements Tool {
    /** Class-wide SLF4J logger; {@code dumpNodes} and {@code run} use it for progress and error messages. */
    public static final Logger LOG = LoggerFactory.getLogger(NodeDumper.class);

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/NodeDumper$AggrType.class */
    /**
     * How the values collected for one host or domain are folded into a single
     * number when grouped output is requested.
     */
    public enum AggrType {
        /** Add all values of the group together. */
        SUM,

        /** Keep only the largest value of the group. */
        MAX;
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/NodeDumper$DumpType.class */
    /** Selects which per-node figure the dump reports. */
    public enum DumpType {
        /** Report the node's inlink count. */
        INLINKS,

        /** Report the node's outlink count. */
        OUTLINKS,

        /** Report the node's inlink score. */
        SCORES;
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/NodeDumper$Dumper.class */
    /**
     * Mapper/reducer pair that {@code dumpNodes} installs when a {@code NameType}
     * is given.
     *
     * <p>The mapper replaces each record's key with the host or domain derived
     * from it and emits one figure per node (inlink count, outlink count or
     * inlink score). The reducer then folds the figures of each host/domain into
     * either their sum or their maximum.
     */
    public static class Dumper extends Configured implements Mapper<Text, Node, Text, FloatWritable>, Reducer<Text, FloatWritable, Text, FloatWritable> {
        /** Job configuration captured in {@link #configure(JobConf)}. */
        private JobConf conf;
        /** Emit inlink counts. */
        private boolean inlinks = false;
        /** Emit outlink counts (only consulted when {@code inlinks} is false). */
        private boolean outlinks = false;
        /** Read from the job but not consulted: the score is the fallback when neither count is selected. */
        private boolean scores = false;
        /** Maximum number of values consumed per key in {@code reduce}; {@code Long.MAX_VALUE} means no limit. */
        private long topn = Long.MAX_VALUE;
        /** Group by host; when false the mapper groups by domain name. */
        private boolean host = false;
        /** Read from the job but not consulted: domain is the fallback when {@code host} is false. */
        private boolean domain = false;
        /** Aggregate by summing; when false the reducer takes the maximum. */
        private boolean sum = false;
        /** Read from the job but not consulted: max is the fallback when {@code sum} is false. */
        private boolean max = false;

        /**
         * Reads the switches that {@code dumpNodes} stored in the job.
         *
         * @param jobConf the configuration of the running job
         */
        @Override // org.apache.hadoop.mapred.JobConfigurable
        public void configure(JobConf jobConf) {
            this.conf = jobConf;
            this.inlinks = jobConf.getBoolean(WebGraph.INLINK_DIR, false);
            this.outlinks = jobConf.getBoolean(WebGraph.OUTLINK_DIR, false);
            this.scores = jobConf.getBoolean("scores", true);
            this.topn = jobConf.getLong("topn", Long.MAX_VALUE);
            this.host = jobConf.getBoolean(Generator.GENERATOR_COUNT_VALUE_HOST, false);
            this.domain = jobConf.getBoolean(Generator.GENERATOR_COUNT_VALUE_DOMAIN, false);
            this.sum = jobConf.getBoolean("sum", false);
            // The key must stay identical to the one dumpNodes writes.
            this.max = jobConf.getBoolean(DepthSelector.MAX_KEY, false);
        }

        /** Nothing to release. */
        @Override // java.io.Closeable, java.lang.AutoCloseable
        public void close() {
        }

        /**
         * Emits {@code (host-or-domain, figure)} for one node.
         *
         * @param text            record key; overwritten in place with the host or domain derived from it
         * @param node            node whose inlink count, outlink count or inlink score is emitted
         * @param outputCollector receives the re-keyed figure
         * @param reporter        unused
         * @throws IOException if the collector (or the domain lookup) fails
         */
        @Override // org.apache.hadoop.mapred.Mapper
        public void map(Text text, Node node, OutputCollector<Text, FloatWritable> outputCollector, Reporter reporter) throws IOException {
            final float figure;
            if (this.inlinks) {
                figure = node.getNumInlinks();
            } else if (this.outlinks) {
                figure = node.getNumOutlinks();
            } else {
                figure = node.getInlinkScore();
            }

            if (this.host) {
                text.set(URLUtil.getHost(text.toString()));
            } else {
                text.set(URLUtil.getDomainName(text.toString()));
            }
            outputCollector.collect(text, new FloatWritable(figure));
        }

        /**
         * Folds the figures of one host/domain into their sum or maximum and
         * emits a single {@code (key, aggregate)} pair.
         *
         * <p>At most {@code topn} values are consumed. The maximum is seeded from
         * the first real value rather than from zero, so a group whose values are
         * all negative reports its true maximum instead of {@code 0}. NaN values
         * are skipped in max mode and an empty fold still yields {@code 0}.
         *
         * @param text            the host or domain
         * @param it              the figures emitted for that key
         * @param outputCollector receives the aggregate
         * @param reporter        unused
         * @throws IOException if the collector fails
         */
        @Override // org.apache.hadoop.mapred.Reducer
        public void reduce(Text text, Iterator<FloatWritable> it, OutputCollector<Text, FloatWritable> outputCollector, Reporter reporter) throws IOException {
            float aggregate = 0.0f;
            boolean maxSeeded = false;
            for (long consumed = 0; it.hasNext() && consumed < this.topn; consumed++) {
                float value = it.next().get();
                if (this.sum) {
                    aggregate += value;
                } else if (!maxSeeded) {
                    // Comparisons with NaN are always false, so NaN can never be a maximum.
                    if (!Float.isNaN(value)) {
                        aggregate = value;
                        maxSeeded = true;
                    }
                } else if (aggregate < value) {
                    aggregate = value;
                }
            }
            // Collapse -0.0 to +0.0 so zero results render the same as before.
            if (aggregate == 0.0f) {
                aggregate = 0.0f;
            }
            outputCollector.collect(text, new FloatWritable(aggregate));
        }
    }

    /* JADX INFO: Access modifiers changed from: private */
    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/NodeDumper$NameType.class */
    /** The URL component that grouped output is keyed by. */
    public enum NameType {
        /** Group nodes by the host of their URL. */
        HOST,

        /** Group nodes by the domain name of their URL. */
        DOMAIN;
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/scoring/webgraph/NodeDumper$Sorter.class */
    /**
     * Mapper/reducer pair that {@code dumpNodes} installs when no grouping is
     * requested.
     *
     * <p>The mapper emits {@code (-figure, url)} so that the framework's key
     * ordering puts the largest figures first; the reducer flips the sign back
     * and emits {@code (url, figure)}.
     */
    public static class Sorter extends Configured implements Mapper<Text, Node, FloatWritable, Text>, Reducer<FloatWritable, Text, Text, FloatWritable> {
        /** Job configuration; needed to clone keys in {@code reduce}. */
        private JobConf conf;
        /** Emit inlink counts. */
        private boolean inlinks = false;
        /** Emit outlink counts (only consulted when {@code inlinks} is false). */
        private boolean outlinks = false;
        /** Read from the job but not consulted: the score is the fallback. */
        private boolean scores = false;
        /** Maximum number of records emitted per {@code reduce} call. */
        private long topn = FSConstants.QUOTA_DONT_SET;

        /**
         * Reads the switches that {@code dumpNodes} stored in the job.
         *
         * @param jobConf the configuration of the running job
         */
        @Override // org.apache.hadoop.mapred.JobConfigurable
        public void configure(JobConf jobConf) {
            this.topn = jobConf.getLong("topn", FSConstants.QUOTA_DONT_SET);
            this.scores = jobConf.getBoolean("scores", true);
            this.outlinks = jobConf.getBoolean(WebGraph.OUTLINK_DIR, false);
            this.inlinks = jobConf.getBoolean(WebGraph.INLINK_DIR, false);
            this.conf = jobConf;
        }

        /** Nothing to release. */
        @Override // java.io.Closeable, java.lang.AutoCloseable
        public void close() {
        }

        /**
         * Emits the node's selected figure, negated, as key and the original key
         * as value.
         *
         * @param text            record key, passed through as the output value
         * @param node            node whose inlink count, outlink count or inlink score is used
         * @param outputCollector receives {@code (-figure, text)}
         * @param reporter        unused
         * @throws IOException if the collector fails
         */
        @Override // org.apache.hadoop.mapred.Mapper
        public void map(Text text, Node node, OutputCollector<FloatWritable, Text> outputCollector, Reporter reporter) throws IOException {
            float figure;
            if (this.inlinks) {
                figure = node.getNumInlinks();
            } else if (this.outlinks) {
                figure = node.getNumOutlinks();
            } else {
                figure = node.getInlinkScore();
            }
            FloatWritable negated = new FloatWritable(-figure);
            outputCollector.collect(negated, text);
        }

        /**
         * Restores the sign of the key and emits {@code (url, figure)} for the
         * values that share it.
         *
         * <p>NOTE(review): the {@code topn} counter starts at zero on every call,
         * so the limit applies per distinct figure, not to the output as a
         * whole — confirm this is the intended meaning of {@code -topn}.
         *
         * @param floatWritable   the negated figure
         * @param it              the keys that the mapper emitted with this figure
         * @param outputCollector receives {@code (url, figure)}
         * @param reporter        unused
         * @throws IOException if the collector fails
         */
        @Override // org.apache.hadoop.mapred.Reducer
        public void reduce(FloatWritable floatWritable, Iterator<Text> it, OutputCollector<Text, FloatWritable> outputCollector, Reporter reporter) throws IOException {
            float negated = floatWritable.get();
            // Negating zero would give -0.0, so zero is passed through as +0.0.
            float restored = (negated == 0.0f) ? 0.0f : -negated;
            FloatWritable figure = new FloatWritable(restored);

            for (long emitted = 0; it.hasNext() && emitted < this.topn; emitted++) {
                // Clone the value so the collector gets its own copy.
                Text url = (Text) WritableUtils.clone(it.next(), this.conf);
                outputCollector.collect(url, figure);
            }
        }
    }

    /**
     * Configures and runs the dump job over the node database of a web graph.
     *
     * @param path     web graph directory; the node database is read from its {@code WebGraph.NODE_DIR} child
     * @param dumpType which per-node figure to report
     * @param j        value stored under the job's {@code topn} key
     * @param path2    output directory of the job
     * @param z        when true, text output uses {@code =} as key/value separator
     * @param nameType grouping component; {@code null} selects {@code Sorter}, anything else selects {@code Dumper}
     * @param aggrType aggregation used for grouped output
     * @param z2       when true the job writes a sequence file, otherwise plain text
     * @throws Exception if the job fails; an {@code IOException} is logged before being rethrown
     */
    public void dumpNodes(Path path, DumpType dumpType, long j, Path path2, boolean z, NameType nameType, AggrType aggrType, boolean z2) throws Exception {
        SimpleDateFormat timestampFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long startMillis = System.currentTimeMillis();
        LOG.info("NodeDumper: starting at " + timestampFormat.format(Long.valueOf(startMillis)));

        Path nodeDb = new Path(path, WebGraph.NODE_DIR);
        NutchJob job = new NutchJob(getConf());
        job.setJobName("NodeDumper: " + path);

        // Input: the node database, stored as sequence files.
        FileInputFormat.addInputPath(job, nodeDb);
        job.setInputFormat(SequenceFileInputFormat.class);

        // Grouped output goes through Dumper, a plain ranking through Sorter.
        if (nameType != null) {
            job.setMapperClass(Dumper.class);
            job.setReducerClass(Dumper.class);
            job.setMapOutputKeyClass(Text.class);
            job.setMapOutputValueClass(FloatWritable.class);
        } else {
            job.setMapperClass(Sorter.class);
            job.setReducerClass(Sorter.class);
            job.setMapOutputKeyClass(FloatWritable.class);
            job.setMapOutputValueClass(Text.class);
        }

        // Output: (Text, FloatWritable) pairs written by a single reducer.
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FloatWritable.class);
        FileOutputFormat.setOutputPath(job, path2);
        if (!z2) {
            job.setOutputFormat(TextOutputFormat.class);
        } else {
            job.setOutputFormat(SequenceFileOutputFormat.class);
        }
        job.setNumReduceTasks(1);

        // Switches read back by Dumper.configure / Sorter.configure.
        boolean wantInlinks = dumpType == DumpType.INLINKS;
        boolean wantOutlinks = dumpType == DumpType.OUTLINKS;
        boolean wantScores = dumpType == DumpType.SCORES;
        job.setBoolean(WebGraph.INLINK_DIR, wantInlinks);
        job.setBoolean(WebGraph.OUTLINK_DIR, wantOutlinks);
        job.setBoolean("scores", wantScores);
        job.setBoolean(Generator.GENERATOR_COUNT_VALUE_HOST, nameType == NameType.HOST);
        job.setBoolean(Generator.GENERATOR_COUNT_VALUE_DOMAIN, nameType == NameType.DOMAIN);
        job.setBoolean("sum", aggrType == AggrType.SUM);
        job.setBoolean(DepthSelector.MAX_KEY, aggrType == AggrType.MAX);
        job.setLong("topn", j);

        if (z) {
            job.set("mapred.textoutputformat.separator", "=");
        }

        try {
            LOG.info("NodeDumper: running");
            JobClient.runJob(job);
            long endMillis = System.currentTimeMillis();
            String finishedAt = timestampFormat.format(Long.valueOf(endMillis));
            LOG.info("NodeDumper: finished at " + finishedAt + ", elapsed: " + TimingUtil.elapsedTime(startMillis, endMillis));
        } catch (IOException e) {
            // Log the full stack trace here; the caller still sees the failure.
            LOG.error(StringUtils.stringifyException(e));
            throw e;
        }
    }

    /**
     * Command-line entry point: runs the tool through {@code ToolRunner} with a
     * Nutch configuration and exits the JVM with the tool's return code.
     *
     * @param strArr command-line arguments, passed to {@code ToolRunner.run} together with the tool
     * @throws Exception if {@code ToolRunner.run} throws
     */
    public static void main(String[] strArr) throws Exception {
        int exitCode = ToolRunner.run(NutchConfiguration.create(), new NodeDumper(), strArr);
        System.exit(exitCode);
    }

    /**
     * Parses the command line and runs {@code dumpNodes}.
     *
     * <p>Recognised options: {@code -help}, {@code -webgraphdb <dir>},
     * {@code -output <dir>}, {@code -inlinks}, {@code -outlinks}, {@code -scores},
     * {@code -topn [n]}, {@code -asEff}, {@code -asSequenceFile} and
     * {@code -group <host|domain> <sum|max>}. Inlinks take precedence over
     * outlinks; scores are dumped when neither is given. A {@code -topn} without
     * a number means "no limit".
     *
     * @param strArr the command-line arguments
     * @return {@code 0} on success, {@code -1} when help was requested or a
     *         mandatory option ({@code -webgraphdb}, {@code -output}) is missing,
     *         {@code -2} when parsing or the job itself failed
     * @throws Exception declared by {@code Tool}; failures inside the method are
     *         logged and reported through the return value instead
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws Exception {
        Options options = buildOptions();
        try {
            CommandLine line = new GnuParser().parse(options, strArr);

            // Both directories are mandatory: without -output the job cannot be
            // configured, so show usage instead of failing on a null path later.
            if (line.hasOption("help") || !line.hasOption("webgraphdb") || !line.hasOption("output")) {
                new HelpFormatter().printHelp("NodeDumper", options);
                return -1;
            }

            String webGraphDb = line.getOptionValue("webgraphdb");
            String output = line.getOptionValue("output");

            DumpType dumpType;
            if (line.hasOption(WebGraph.INLINK_DIR)) {
                dumpType = DumpType.INLINKS;
            } else if (line.hasOption(WebGraph.OUTLINK_DIR)) {
                dumpType = DumpType.OUTLINKS;
            } else {
                dumpType = DumpType.SCORES;
            }

            // -topn takes an optional argument, so the value may be absent even
            // when the option is present; treat that as "no limit".
            String topnValue = line.getOptionValue("topn");
            long topn = topnValue != null ? Long.parseLong(topnValue) : Long.MAX_VALUE;

            NameType nameType = null;
            AggrType aggrType = null;
            String[] group = line.getOptionValues("group");
            if (group != null && group.length == 2) {
                if (Generator.GENERATOR_COUNT_VALUE_HOST.equals(group[0])) {
                    nameType = NameType.HOST;
                } else if (Generator.GENERATOR_COUNT_VALUE_DOMAIN.equals(group[0])) {
                    nameType = NameType.DOMAIN;
                }

                if ("sum".equals(group[1])) {
                    aggrType = AggrType.SUM;
                } else if ("max".equals(group[1])) {
                    aggrType = AggrType.MAX;
                }
            }

            dumpNodes(new Path(webGraphDb), dumpType, topn, new Path(output), line.hasOption("asEff"), nameType, aggrType, line.hasOption("asSequenceFile"));
            return 0;
        } catch (Exception e) {
            LOG.error("NodeDumper: " + StringUtils.stringifyException(e));
            return -2;
        }
    }

    /** Builds the command-line option set understood by {@link #run(String[])}. */
    private static Options buildOptions() {
        Options options = new Options();

        OptionBuilder.withArgName("help");
        OptionBuilder.withDescription("show this help message");
        options.addOption(OptionBuilder.create("help"));

        OptionBuilder.withArgName("webgraphdb");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("the web graph database to use");
        options.addOption(OptionBuilder.create("webgraphdb"));

        OptionBuilder.withArgName(WebGraph.INLINK_DIR);
        OptionBuilder.withDescription("show highest inlinks");
        options.addOption(OptionBuilder.create(WebGraph.INLINK_DIR));

        OptionBuilder.withArgName(WebGraph.OUTLINK_DIR);
        OptionBuilder.withDescription("show highest outlinks");
        options.addOption(OptionBuilder.create(WebGraph.OUTLINK_DIR));

        OptionBuilder.withArgName("scores");
        OptionBuilder.withDescription("show highest scores");
        options.addOption(OptionBuilder.create("scores"));

        OptionBuilder.withArgName("topn");
        OptionBuilder.hasOptionalArg();
        OptionBuilder.withDescription("show topN scores");
        options.addOption(OptionBuilder.create("topn"));

        OptionBuilder.withArgName("output");
        OptionBuilder.hasArg();
        OptionBuilder.withDescription("the output directory to use");
        options.addOption(OptionBuilder.create("output"));

        OptionBuilder.withArgName("asEff");
        OptionBuilder.withDescription("Solr ExternalFileField compatible output format");
        options.addOption(OptionBuilder.create("asEff"));

        OptionBuilder.hasArgs(2);
        OptionBuilder.withDescription("group <host|domain> <sum|max>");
        options.addOption(OptionBuilder.create("group"));

        OptionBuilder.withArgName("asSequenceFile");
        OptionBuilder.withDescription("whether to output as a sequencefile");
        options.addOption(OptionBuilder.create("asSequenceFile"));

        return options;
    }
}
