package org.apache.nutch.indexer;

import java.io.IOException;
import java.util.Collection;
import java.util.Iterator;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.crawl.Inlinks;
import org.apache.nutch.crawl.NutchWritable;
import org.apache.nutch.indexer.solr.SolrConstants;
import org.apache.nutch.metadata.Metadata;
import org.apache.nutch.metadata.Nutch;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.parse.ParseImpl;
import org.apache.nutch.parse.ParseText;
import org.apache.nutch.scoring.ScoringFilterException;
import org.apache.nutch.scoring.ScoringFilters;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/indexer/IndexerMapReduce.class */
/**
 * Map/reduce implementation that turns crawl data into index actions.
 *
 * <p>The map phase wraps every input value in a {@link NutchWritable} so that
 * heterogeneous records can be grouped under the same key. The reduce phase
 * collects the records seen for a key (inlinks, crawl datums, parse data and
 * parse text) and emits a {@link NutchIndexAction} that either adds a
 * document or, when {@link #INDEXER_DELETE} is enabled, deletes one.
 */
public class IndexerMapReduce extends Configured implements Mapper<Text, Writable, Text, NutchWritable>, Reducer<Text, NutchWritable, Text, NutchIndexAction> {
    public static final Logger LOG = LoggerFactory.getLogger(IndexerMapReduce.class);

    /** Configuration key: when true, gone pages and permanent redirects produce delete actions. */
    public static final String INDEXER_DELETE = "indexer.delete";

    /** Counter group used for every counter reported by this job. */
    private static final String COUNTER_GROUP = "IndexerStatus";

    private boolean delete = false;
    private IndexingFilters filters;
    private ScoringFilters scfilters;

    /**
     * Stores the job configuration and builds the indexing and scoring
     * filter chains from it.
     *
     * @param jobConf configuration of the running job
     */
    @Override
    public void configure(JobConf jobConf) {
        setConf(jobConf);
        this.filters = new IndexingFilters(getConf());
        this.scfilters = new ScoringFilters(getConf());
        this.delete = jobConf.getBoolean(INDEXER_DELETE, false);
    }

    /**
     * Emits the input value unchanged apart from wrapping it in a
     * {@link NutchWritable}.
     *
     * @param text            record key
     * @param writable        record value of any supported type
     * @param outputCollector receives the wrapped value under the same key
     * @param reporter        unused
     * @throws IOException if the collector fails
     */
    @Override
    public void map(Text text, Writable writable, OutputCollector<Text, NutchWritable> outputCollector, Reporter reporter) throws IOException {
        outputCollector.collect(text, new NutchWritable(writable));
    }

    /**
     * Combines all records that share a key into at most one "add" action,
     * plus delete actions when deletion is enabled.
     *
     * <p>A document is only added when a db datum, a fetch datum with status
     * {@link CrawlDatum#STATUS_FETCH_SUCCESS}, parse text and successful parse
     * data are all present. Documents rejected by the indexing filters, or for
     * which filtering or scoring fails, are not emitted.
     *
     * @param text            key shared by all values
     * @param it              values collected for the key
     * @param outputCollector receives the resulting index actions
     * @param reporter        receives the {@code IndexerStatus} counters
     * @throws IOException      if the collector fails
     * @throws RuntimeException if a crawl datum carries a status that is
     *                          neither a db status, a fetch status, nor one of
     *                          the explicitly ignored statuses
     */
    @Override
    public void reduce(Text text, Iterator<NutchWritable> it, OutputCollector<Text, NutchIndexAction> outputCollector, Reporter reporter) throws IOException {
        Inlinks inlinks = null;
        CrawlDatum dbDatum = null;
        CrawlDatum fetchDatum = null;
        ParseData parseData = null;
        ParseText parseText = null;

        while (it.hasNext()) {
            Writable value = it.next().get();
            if (value instanceof Inlinks) {
                inlinks = (Inlinks) value;
            } else if (value instanceof CrawlDatum) {
                CrawlDatum datum = (CrawlDatum) value;
                if (CrawlDatum.hasDbStatus(datum)) {
                    dbDatum = datum;
                } else if (CrawlDatum.hasFetchStatus(datum)) {
                    // Fetch datums flagged "not modified" are skipped entirely.
                    if (datum.getStatus() != CrawlDatum.STATUS_FETCH_NOTMODIFIED) {
                        fetchDatum = datum;
                        if (this.delete) {
                            emitDeleteIfRequired(text, fetchDatum, outputCollector, reporter);
                        }
                    }
                } else if (!isIgnorableStatus(datum.getStatus())) {
                    throw new RuntimeException("Unexpected status: " + datum.getStatus());
                }
            } else if (value instanceof ParseData) {
                parseData = (ParseData) value;
            } else if (value instanceof ParseText) {
                parseText = (ParseText) value;
            } else if (LOG.isWarnEnabled()) {
                LOG.warn("Unrecognized type: " + value.getClass());
            }
        }

        // Every piece must be present and the fetch/parse must have succeeded.
        if (fetchDatum == null || dbDatum == null || parseText == null || parseData == null) {
            return;
        }
        if (!parseData.getStatus().isSuccess() || fetchDatum.getStatus() != CrawlDatum.STATUS_FETCH_SUCCESS) {
            return;
        }

        NutchDocument rawDoc = new NutchDocument();
        Metadata contentMeta = parseData.getContentMeta();
        rawDoc.add("segment", contentMeta.get(Nutch.SEGMENT_NAME_KEY));
        rawDoc.add("digest", contentMeta.get(Nutch.SIGNATURE_KEY));
        ParseImpl parse = new ParseImpl(parseText, parseData);

        // Copy the representative URL entry, if any, from the db datum to the fetch datum.
        Text reprUrl = (Text) dbDatum.getMetaData().get(Nutch.WRITABLE_REPR_URL_KEY);
        if (reprUrl != null) {
            fetchDatum.getMetaData().put(Nutch.WRITABLE_REPR_URL_KEY, reprUrl);
        }

        NutchDocument doc;
        try {
            doc = this.filters.filter(rawDoc, parse, text, fetchDatum, inlinks);
        } catch (IndexingException e) {
            if (LOG.isWarnEnabled()) {
                // Pass the exception as well so the stack trace is not lost.
                LOG.warn("Error indexing " + text + ": " + e, e);
            }
            reporter.incrCounter(COUNTER_GROUP, "Errors", 1L);
            return;
        }
        if (doc == null) {
            reporter.incrCounter(COUNTER_GROUP, "Skipped by filters", 1L);
            return;
        }

        float boost;
        try {
            boost = this.scfilters.indexerScore(text, doc, dbDatum, fetchDatum, parse, inlinks, 1.0f);
        } catch (ScoringFilterException e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("Error calculating score " + text + ": " + e, e);
            }
            // Count the dropped document, mirroring the indexing-failure path.
            reporter.incrCounter(COUNTER_GROUP, "Errors", 1L);
            return;
        }

        doc.setWeight(boost);
        doc.add(SolrConstants.BOOST_FIELD, Float.toString(boost));
        reporter.incrCounter(COUNTER_GROUP, "Documents added", 1L);
        outputCollector.collect(text, new NutchIndexAction(doc, NutchIndexAction.ADD));
    }

    /**
     * Emits a delete action when the fetch datum is gone or a permanent
     * redirect; does nothing for any other status.
     */
    private static void emitDeleteIfRequired(Text key, CrawlDatum fetchDatum, OutputCollector<Text, NutchIndexAction> outputCollector, Reporter reporter) throws IOException {
        String counter;
        if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE) {
            counter = "Documents deleted";
        } else if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_REDIR_PERM) {
            counter = "Perm redirects deleted";
        } else {
            return;
        }
        reporter.incrCounter(COUNTER_GROUP, counter, 1L);
        outputCollector.collect(key, new NutchIndexAction(null, NutchIndexAction.DELETE));
    }

    /** Returns true for non-db, non-fetch statuses that the reducer silently ignores. */
    private static boolean isIgnorableStatus(byte status) {
        return status == CrawlDatum.STATUS_LINKED
                || status == CrawlDatum.STATUS_SIGNATURE
                || status == CrawlDatum.STATUS_PARSE_META;
    }

    /** No resources to release. */
    @Override
    public void close() throws IOException {
    }

    /**
     * Configures a job to run this class as both mapper and reducer.
     *
     * <p>Adds four sub-directories of every segment, the {@code current}
     * directory of the crawl db and, when given, the {@code current}
     * directory of the link db as sequence-file inputs.
     *
     * @param path       crawl db location
     * @param path2      link db location, or {@code null} to index without it
     * @param collection segments to index
     * @param jobConf    job configuration to populate
     */
    public static void initMRJob(Path path, Path path2, Collection<Path> collection, JobConf jobConf) {
        LOG.info("IndexerMapReduce: crawldb: {}", path);
        if (path2 != null) {
            LOG.info("IndexerMapReduce: linkdb: {}", path2);
        }

        for (Path segment : collection) {
            LOG.info("IndexerMapReduces: adding segment: {}", segment);
            FileInputFormat.addInputPath(jobConf, new Path(segment, CrawlDatum.FETCH_DIR_NAME));
            FileInputFormat.addInputPath(jobConf, new Path(segment, CrawlDatum.PARSE_DIR_NAME));
            FileInputFormat.addInputPath(jobConf, new Path(segment, ParseData.DIR_NAME));
            FileInputFormat.addInputPath(jobConf, new Path(segment, ParseText.DIR_NAME));
        }

        FileInputFormat.addInputPath(jobConf, new Path(path, "current"));
        if (path2 != null) {
            FileInputFormat.addInputPath(jobConf, new Path(path2, "current"));
        }

        jobConf.setInputFormat(SequenceFileInputFormat.class);
        jobConf.setMapperClass(IndexerMapReduce.class);
        jobConf.setReducerClass(IndexerMapReduce.class);
        jobConf.setOutputFormat(IndexerOutputFormat.class);
        jobConf.setOutputKeyClass(Text.class);
        jobConf.setMapOutputValueClass(NutchWritable.class);
        // NOTE(review): the reducer emits NutchIndexAction values, yet the declared
        // output value class is NutchWritable. Left unchanged; confirm that
        // IndexerOutputFormat does not depend on this setting before altering it.
        jobConf.setOutputValueClass(NutchWritable.class);
    }
}
