package org.apache.nutch.crawl;

import java.io.IOException;
import java.text.SimpleDateFormat;
import java.util.Iterator;
import java.util.Map;
import java.util.Random;
import java.util.TreeMap;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.WritableComparable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileOutputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Injector.class */
public class Injector extends Configured implements Tool {
    /** Logger used by the injector and by its nested mapper for warnings and progress messages. */
    public static final Logger LOG = LoggerFactory.getLogger(Injector.class);
    /**
     * Name of the seed-line {@code name=value} entry that InjectMapper parses as the URL's
     * score instead of storing it as ordinary metadata.
     */
    public static String nutchScoreMDName = "nutch.score";
    /**
     * Name of the seed-line {@code name=value} entry that InjectMapper parses as the URL's
     * fetch interval instead of storing it as ordinary metadata.
     */
    public static String nutchFetchIntervalMDName = "nutch.fetchInterval";

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Injector$InjectMapper.class */
    /**
     * Mapper that converts each line of a seed list into a {@code (url, CrawlDatum)} pair.
     *
     * <p>A line has the form {@code url[\tname=value]...}. Lines whose trimmed form is empty or
     * starts with {@code #} are ignored. The entries named by {@link Injector#nutchScoreMDName}
     * and {@link Injector#nutchFetchIntervalMDName} override the score and fetch interval for
     * that URL; every other {@code name=value} pair is copied into the datum's metadata. The URL
     * is normalized and filtered before being emitted; it is dropped when the filter returns
     * {@code null} or when normalizing/filtering throws.
     */
    public static class InjectMapper implements Mapper<WritableComparable, Text, Text, CrawlDatum> {
        /** Sentinel meaning "no score was supplied on the seed line". */
        private static final float NO_CUSTOM_SCORE = -1.0f;

        private URLNormalizers urlNormalizers;
        private int interval;
        private float scoreInjected;
        private JobConf jobConf;
        private URLFilters filters;
        private ScoringFilters scfilters;
        private long curTime;

        /**
         * Reads the injector settings from the job configuration.
         *
         * @param jobConf job configuration; supplies {@code db.fetch.interval.default}
         *     (default 2592000), {@code db.score.injected} (default 1.0) and
         *     {@code injector.current.time} (default: the current wall-clock time)
         */
        @Override // org.apache.hadoop.mapred.JobConfigurable
        public void configure(JobConf jobConf) {
            this.jobConf = jobConf;
            this.urlNormalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_INJECT);
            this.interval = jobConf.getInt("db.fetch.interval.default", 2592000);
            this.filters = new URLFilters(jobConf);
            this.scfilters = new ScoringFilters(jobConf);
            this.scoreInjected = jobConf.getFloat("db.score.injected", 1.0f);
            this.curTime = jobConf.getLong("injector.current.time", System.currentTimeMillis());
        }

        /** Nothing to release. */
        @Override // java.io.Closeable, java.lang.AutoCloseable
        public void close() {
        }

        /**
         * Parses one seed line and, if the URL survives normalization and filtering, emits it
         * together with a freshly created {@code CrawlDatum}.
         *
         * @param writableComparable input key (unused)
         * @param text one line of the seed file; reused as the output key after being set to
         *     the normalized URL
         * @param outputCollector receives the {@code (url, datum)} pair
         * @param reporter progress reporter (unused)
         * @throws IOException if the collector fails
         */
        @Override // org.apache.hadoop.mapred.Mapper
        public void map(WritableComparable writableComparable, Text text, OutputCollector<Text, CrawlDatum> outputCollector, Reporter reporter) throws IOException {
            String url = text.toString();

            // Skip comments and blank lines. The blank check also protects the split below:
            // a line made only of tabs splits into a zero-length array.
            String trimmed = url.trim();
            if (trimmed.length() == 0 || trimmed.startsWith("#")) {
                return;
            }

            float customScore = NO_CUSTOM_SCORE;
            int customInterval = this.interval;
            Map<String, String> metadata = new TreeMap<String, String>();

            // Optional tab-separated name=value pairs follow the URL.
            if (url.indexOf('\t') != -1) {
                String[] fields = url.split("\t");
                url = fields[0];
                for (int i = 1; i < fields.length; i++) {
                    int separator = fields[i].indexOf('=');
                    if (separator == -1) {
                        continue; // not a name=value pair
                    }
                    String name = fields[i].substring(0, separator);
                    String value = fields[i].substring(separator + 1);
                    if (name.equals(Injector.nutchScoreMDName)) {
                        try {
                            customScore = Float.parseFloat(value);
                        } catch (NumberFormatException e) {
                            // Keep the default score, but tell the operator the value was bad.
                            Injector.LOG.warn("Ignoring invalid " + name + " value '" + value + "' for url " + url);
                        }
                    } else if (name.equals(Injector.nutchFetchIntervalMDName)) {
                        try {
                            customInterval = Integer.parseInt(value);
                        } catch (NumberFormatException e) {
                            // Keep the default interval, but tell the operator the value was bad.
                            Injector.LOG.warn("Ignoring invalid " + name + " value '" + value + "' for url " + url);
                        }
                    } else {
                        metadata.put(name, value);
                    }
                }
            }

            try {
                url = this.filters.filter(this.urlNormalizers.normalize(url, URLNormalizers.SCOPE_INJECT));
            } catch (Exception e) {
                Injector.LOG.warn("Skipping " + url + ":" + e);
                url = null;
            }
            if (url == null) {
                return; // filtered out or failed to normalize
            }

            text.set(url);
            // NOTE(review): 66 is presumably CrawlDatum.STATUS_INJECTED inlined by the
            // decompiler - confirm and replace with the named constant.
            CrawlDatum datum = new CrawlDatum(66, customInterval);
            datum.setFetchTime(this.curTime);
            for (Map.Entry<String, String> entry : metadata.entrySet()) {
                datum.getMetaData().put(new Text(entry.getKey()), new Text(entry.getValue()));
            }
            // A score parsed as exactly -1 is indistinguishable from "not given" and falls
            // back to the configured default.
            datum.setScore(customScore != NO_CUSTOM_SCORE ? customScore : this.scoreInjected);
            try {
                this.scfilters.injectedScore(text, datum);
            } catch (ScoringFilterException e) {
                Injector.LOG.warn("Cannot filter injected score for url " + url + ", using default (" + e.getMessage() + ")");
            }
            outputCollector.collect(text, datum);
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/Injector$InjectReducer.class */
    /**
     * Reducer that picks, for each URL, which datum ends up in the output: a datum whose status
     * is not 66 wins over one whose status is 66; a status-66 datum is emitted (with its status
     * rewritten to 1) only when no other datum exists for that URL.
     */
    public static class InjectReducer implements Reducer<Text, CrawlDatum, Text, CrawlDatum> {
        // Scratch instances reused across reduce() calls.
        private final CrawlDatum old = new CrawlDatum();
        private final CrawlDatum injected = new CrawlDatum();

        /** No configuration is needed. */
        @Override // org.apache.hadoop.mapred.JobConfigurable
        public void configure(JobConf jobConf) {
        }

        /** Nothing to release. */
        @Override // java.io.Closeable, java.lang.AutoCloseable
        public void close() {
        }

        /**
         * Emits exactly one datum for the given URL.
         *
         * @param text the URL
         * @param it all datums collected for that URL
         * @param outputCollector receives the chosen datum
         * @param reporter progress reporter (unused)
         * @throws IOException if the collector fails
         */
        @Override // org.apache.hadoop.mapred.Reducer
        public void reduce(Text text, Iterator<CrawlDatum> it, OutputCollector<Text, CrawlDatum> outputCollector, Reporter reporter) throws IOException {
            boolean sawExisting = false;
            for (; it.hasNext(); ) {
                CrawlDatum datum = it.next();
                // NOTE(review): 66 presumably is CrawlDatum.STATUS_INJECTED - confirm.
                if (datum.getStatus() != 66) {
                    this.old.set(datum);
                    sawExisting = true;
                    continue;
                }
                this.injected.set(datum);
                // NOTE(review): 1 presumably is CrawlDatum.STATUS_DB_UNFETCHED - confirm.
                this.injected.setStatus(1);
            }
            if (sawExisting) {
                outputCollector.collect(text, this.old);
            } else {
                outputCollector.collect(text, this.injected);
            }
        }
    }

    /** Creates an injector without a configuration; one must be set before {@code inject} is used. */
    public Injector() {
        super();
    }

    /**
     * Creates an injector bound to the given configuration.
     *
     * @param configuration configuration handed to {@code setConf}
     */
    public Injector(Configuration configuration) {
        super();
        this.setConf(configuration);
    }

    /**
     * Injects the seed URLs found under {@code path2} into the crawl db at {@code path}.
     *
     * <p>Runs two jobs: the first maps the seed files to {@code (url, CrawlDatum)} pairs in a
     * temporary directory, the second merges that directory into the crawl db using
     * {@link InjectReducer} and installs the result. The temporary directory is removed
     * afterwards, including when one of the jobs fails.
     *
     * @param path location of the crawl db
     * @param path2 directory (or file) containing the seed URLs
     * @throws IOException if a job, the install step, or the final cleanup fails
     */
    public void inject(Path path, Path path2) throws IOException {
        SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long start = System.currentTimeMillis();
        if (LOG.isInfoEnabled()) {
            LOG.info("Injector: starting at " + sdf.format(start));
            LOG.info("Injector: crawlDb: " + path);
            LOG.info("Injector: urlDir: " + path2);
        }

        Path tempDir = new Path(getConf().get("mapred.temp.dir", Path.CUR_DIR) + "/inject-temp-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        boolean installed = false;
        try {
            // Step 1: turn the text seed list into <url, CrawlDatum> records.
            if (LOG.isInfoEnabled()) {
                LOG.info("Injector: Converting injected urls to crawl db entries.");
            }
            NutchJob sortJob = new NutchJob(getConf());
            sortJob.setJobName("inject " + path2);
            FileInputFormat.addInputPath(sortJob, path2);
            sortJob.setMapperClass(InjectMapper.class);
            FileOutputFormat.setOutputPath(sortJob, tempDir);
            sortJob.setOutputFormat(SequenceFileOutputFormat.class);
            sortJob.setOutputKeyClass(Text.class);
            sortJob.setOutputValueClass(CrawlDatum.class);
            sortJob.setLong("injector.current.time", System.currentTimeMillis());
            JobClient.runJob(sortJob);

            // Step 2: merge the new records with the existing crawl db.
            if (LOG.isInfoEnabled()) {
                LOG.info("Injector: Merging injected urls into crawl db.");
            }
            JobConf mergeJob = CrawlDb.createJob(getConf(), path);
            FileInputFormat.addInputPath(mergeJob, tempDir);
            mergeJob.setReducerClass(InjectReducer.class);
            JobClient.runJob(mergeJob);
            CrawlDb.install(mergeJob, path);
            installed = true;
        } finally {
            if (!installed) {
                // Do not leave inject-temp-* behind when a job fails; the original exception
                // keeps propagating.
                deleteQuietly(tempDir);
            }
        }

        // Success path: a failing delete is still reported to the caller.
        FileSystem.get(getConf()).delete(tempDir, true);

        long end = System.currentTimeMillis();
        LOG.info("Injector: finished at " + sdf.format(end) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
    }

    /** Best-effort recursive delete used on the failure path; logs instead of throwing. */
    private void deleteQuietly(Path dir) {
        try {
            FileSystem.get(getConf()).delete(dir, true);
        } catch (IOException cleanupFailure) {
            LOG.warn("Injector: could not remove temporary directory " + dir + ": " + cleanupFailure);
        }
    }

    /**
     * Command-line entry point: runs the injector through {@code ToolRunner} and exits the JVM
     * with the tool's return code.
     *
     * @param strArr command-line arguments, passed through to {@link #run(String[])}
     * @throws Exception if {@code ToolRunner.run} throws
     */
    public static void main(String[] strArr) throws Exception {
        Configuration conf = NutchConfiguration.create();
        int exitCode = ToolRunner.run(conf, new Injector(), strArr);
        System.exit(exitCode);
    }

    /**
     * Runs an injection from command-line arguments.
     *
     * @param strArr {@code <crawldb> <url_dir>}; extra arguments are ignored
     * @return 0 on success, -1 when fewer than two arguments are given or the injection fails
     *     (the failure is logged, not rethrown)
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws Exception {
        int exitCode = -1;
        if (strArr.length >= 2) {
            try {
                Path crawlDb = new Path(strArr[0]);
                Path urlDir = new Path(strArr[1]);
                inject(crawlDb, urlDir);
                exitCode = 0;
            } catch (Exception failure) {
                LOG.error("Injector: " + StringUtils.stringifyException(failure));
            }
        } else {
            System.err.println("Usage: Injector <crawldb> <url_dir>");
        }
        return exitCode;
    }
}
