package org.apache.nutch.indexer;

import java.util.Iterator;
import java.util.List;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.parse.ParseResult;
import org.apache.nutch.parse.ParseUtil;
import org.apache.nutch.protocol.Content;
import org.apache.nutch.protocol.Protocol;
import org.apache.nutch.protocol.ProtocolFactory;
import org.apache.nutch.util.NutchConfiguration;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/indexer/IndexingFiltersChecker.class */
/**
 * Command-line tool that fetches a single URL, parses the fetched content,
 * runs the configured indexing filters over the parse result and prints the
 * fields of the resulting document to standard output.
 *
 * <p>Usage: {@code IndexingFiltersChecker <url>}
 */
public class IndexingFiltersChecker extends Configured implements Tool {
    public static final Logger LOG = LoggerFactory.getLogger(IndexingFiltersChecker.class);

    /** Maximum number of characters printed for each field value. */
    private static final int MAX_VALUE_LENGTH = 100;

    /** Configuration handed to the protocol factory, parser and indexing filters. */
    Configuration conf;

    /**
     * Fetches, parses and filters the URL given as the only argument, then
     * prints every field of the resulting document (values truncated to
     * {@value #MAX_VALUE_LENGTH} characters).
     *
     * @param strArr command-line arguments; must contain exactly one element, the URL
     * @return {@code 0} on success or when no content could be fetched,
     *         {@code -1} on a usage error or when the content type is unknown
     * @throws Exception if fetching or parsing fails
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws Exception {
        if (strArr == null || strArr.length != 1) {
            System.err.println("Usage: IndexingFiltersChecker <url>");
            // Return instead of System.exit so in-process callers are not killed;
            // main() turns the return value into the process exit status.
            return -1;
        }
        String url = strArr[0];
        LOG.info("fetching: {}", url);

        IndexingFilters indexingFilters = new IndexingFilters(this.conf);
        Protocol protocol = new ProtocolFactory(this.conf).getProtocol(url);
        CrawlDatum crawlDatum = new CrawlDatum();
        Content content = protocol.getProtocolOutput(new Text(url), crawlDatum).getContent();

        // The null check must come before any access to the content.
        if (content == null) {
            System.out.println("No content for " + url);
            return 0;
        }

        String contentType = content.getContentType();
        if (contentType == null) {
            LOG.error("Unable to determine the content type of {}", url);
            return -1;
        }
        // Expose the content type to the indexing filters through the datum metadata.
        crawlDatum.getMetaData().put((Writable) new Text("Content-Type"), (Writable) new Text(contentType));

        LOG.info("parsing: {}", url);
        LOG.info("contentType: {}", contentType);

        ParseResult parse = new ParseUtil(this.conf).parse(content);
        NutchDocument nutchDocument = new NutchDocument();
        Text urlText = new Text(url);
        try {
            indexingFilters.filter(nutchDocument, parse.get(urlText), urlText, crawlDatum, null);
        } catch (IndexingException e) {
            // Best effort: report the failure, then still print whatever fields
            // were added to the document before the filter chain failed.
            System.err.println("Indexing filters failed for " + url + ": " + e);
            LOG.error("Indexing filters failed for " + url, e);
        }

        printFields(nutchDocument);
        return 0;
    }

    /** Prints each value of each field as {@code name :<TAB>value}, truncating long values. */
    private static void printFields(NutchDocument nutchDocument) {
        for (String fieldName : nutchDocument.getFieldNames()) {
            List<Object> values = nutchDocument.getField(fieldName).getValues();
            if (values == null) {
                continue;
            }
            for (Object value : values) {
                String rendered = String.valueOf(value);
                int end = Math.min(MAX_VALUE_LENGTH, rendered.length());
                System.out.println(fieldName + " :\t" + rendered.substring(0, end));
            }
        }
    }

    /**
     * Runs the checker through {@link ToolRunner} with a Nutch configuration
     * and exits the JVM with the value returned by {@link #run(String[])}.
     *
     * @param strArr command-line arguments, see {@link #run(String[])}
     * @throws Exception if the tool fails
     */
    public static void main(String[] strArr) throws Exception {
        System.exit(ToolRunner.run(NutchConfiguration.create(), new IndexingFiltersChecker(), strArr));
    }

    /** Returns the configuration stored in this class's own {@code conf} field. */
    @Override // org.apache.hadoop.conf.Configured, org.apache.hadoop.conf.Configurable
    public Configuration getConf() {
        return this.conf;
    }

    /** Stores the given configuration in this class's own {@code conf} field. */
    @Override // org.apache.hadoop.conf.Configured, org.apache.hadoop.conf.Configurable
    public void setConf(Configuration configuration) {
        this.conf = configuration;
    }
}
