package org.apache.nutch.indexer.solr;

import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;
import java.net.MalformedURLException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.mapred.InputFormat;
import org.apache.hadoop.mapred.InputSplit;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.RecordReader;
import org.apache.hadoop.mapred.Reducer;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.lib.IdentityMapper;
import org.apache.hadoop.mapred.lib.NullOutputFormat;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.solr.client.solrj.SolrQuery;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.SolrServerException;
import org.apache.solr.client.solrj.impl.CommonsHttpSolrServer;
import org.apache.solr.client.solrj.request.UpdateRequest;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/indexer/solr/SolrDeleteDuplicates.class */
public class SolrDeleteDuplicates implements Reducer<Text, SolrRecord, Text, SolrRecord>, Tool {
    /** Logger used by the job driver and by the reducer callbacks of this class. */
    public static final Logger LOG = LoggerFactory.getLogger(SolrDeleteDuplicates.class);
    /** Solr range query on the {@code id} field; the input format uses it to enumerate the index. */
    private static final String SOLR_GET_ALL_QUERY = "id:[* TO *]";
    /** Batch size for delete requests: reduce() sends the queued deletes once this many have accumulated. */
    private static final int NUM_MAX_DELETE_REQUEST = 1000;
    /** Configuration handed in through setConf() and returned by getConf(). */
    private Configuration conf;
    /** Solr client created in configure(); remains null until configure() has run. */
    private SolrServer solr;
    /** Read from the "noCommit" job property in configure(); when true, close() does not call commit. */
    private boolean noCommit = false;
    /** Number of delete-by-id entries queued in updateRequest that have not been sent yet. */
    private int numDeletes = 0;
    /** Accumulates delete-by-id entries until they are sent to Solr as one request. */
    private UpdateRequest updateRequest = new UpdateRequest();

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/indexer/solr/SolrDeleteDuplicates$SolrInputFormat.class */
    /**
     * Hadoop input format that exposes the documents of a Solr index to the
     * deduplication job. Each record's key is the document's {@code digest} value and
     * its value is a {@link SolrRecord} filled from the same document.
     */
    public static class SolrInputFormat implements InputFormat<Text, SolrRecord> {

        /**
         * Splits the index into contiguous document ranges.
         *
         * <p>Every split except the last covers {@code numFound / splits} documents;
         * the last split receives whatever remains.
         *
         * @param jobConf job configuration used to locate the Solr server
         * @param i       requested number of splits; values below 1 are treated as 1
         * @return one {@link SolrInputSplit} per range
         * @throws IOException if the Solr query fails, or if the index reports more
         *                     documents than an {@code int} offset can address
         */
        @Override // org.apache.hadoop.mapred.InputFormat
        public InputSplit[] getSplits(JobConf jobConf, int i) throws IOException {
            // A hint below 1 would otherwise cause a division by zero or a negative array size.
            final int splitCount = Math.max(1, i);

            CommonsHttpSolrServer server = SolrUtils.getCommonsHttpSolrServer(jobConf);
            // Only the hit count is needed here, so ask for a single row carrying just the id.
            SolrQuery countQuery = new SolrQuery(SolrDeleteDuplicates.SOLR_GET_ALL_QUERY);
            countQuery.setFields(new String[]{SolrConstants.ID_FIELD});
            countQuery.setRows(1);

            final long reported;
            try {
                reported = server.query(countQuery).getResults().getNumFound();
            } catch (SolrServerException e) {
                throw new IOException(e);
            }
            // Split offsets are ints; fail loudly instead of silently truncating the count.
            if (reported > Integer.MAX_VALUE) {
                throw new IOException("Solr index holds " + reported
                        + " documents, more than the " + Integer.MAX_VALUE + " this input format can address");
            }

            final int numFound = (int) reported;
            final int docsPerSplit = numFound / splitCount;
            SolrInputSplit[] splits = new SolrInputSplit[splitCount];
            int begin = 0;
            for (int s = 0; s < splitCount - 1; s++) {
                splits[s] = new SolrInputSplit(begin, docsPerSplit);
                begin += docsPerSplit;
            }
            // The final split absorbs the remainder of the integer division.
            splits[splitCount - 1] = new SolrInputSplit(begin, numFound - begin);
            return splits;
        }

        /**
         * Fetches the documents of one split from Solr in a single query and returns
         * a reader that iterates over them in memory.
         *
         * @param inputSplit the {@link SolrInputSplit} describing the document range
         * @param jobConf    job configuration used to locate the Solr server
         * @param reporter   unused
         * @return a reader producing (digest, record) pairs for the split
         * @throws IOException if the Solr query fails
         */
        @Override // org.apache.hadoop.mapred.InputFormat
        public RecordReader<Text, SolrRecord> getRecordReader(InputSplit inputSplit, JobConf jobConf, Reporter reporter) throws IOException {
            CommonsHttpSolrServer server = SolrUtils.getCommonsHttpSolrServer(jobConf);
            SolrInputSplit split = (SolrInputSplit) inputSplit;
            final int numDocs = split.getNumDocs();

            SolrQuery pageQuery = new SolrQuery(SolrDeleteDuplicates.SOLR_GET_ALL_QUERY);
            pageQuery.setFields(new String[]{SolrConstants.ID_FIELD, SolrConstants.BOOST_FIELD, SolrConstants.TIMESTAMP_FIELD, "digest"});
            pageQuery.setStart(Integer.valueOf(split.getDocBegin()));
            pageQuery.setRows(Integer.valueOf(numDocs));

            final SolrDocumentList results;
            try {
                results = server.query(pageQuery).getResults();
            } catch (SolrServerException e) {
                throw new IOException(e);
            }
            // Solr may return fewer rows than requested (for example if the index shrank
            // after the splits were computed), so never iterate past what actually came back.
            final int available = Math.min(numDocs, results.size());

            return new RecordReader<Text, SolrRecord>() {
                /** Index of the next document to hand out. */
                private int currentDoc = 0;

                /** Nothing to release: the documents are already held in memory. */
                @Override // org.apache.hadoop.mapred.RecordReader
                public void close() throws IOException {
                }

                @Override // org.apache.hadoop.mapred.RecordReader
                public Text createKey() {
                    return new Text();
                }

                @Override // org.apache.hadoop.mapred.RecordReader
                public SolrRecord createValue() {
                    return new SolrRecord();
                }

                /** @return the number of documents consumed so far */
                @Override // org.apache.hadoop.mapred.RecordReader
                public long getPos() throws IOException {
                    return this.currentDoc;
                }

                /** @return the consumed fraction of the split, between 0 and 1 */
                @Override // org.apache.hadoop.mapred.RecordReader
                public float getProgress() throws IOException {
                    if (available == 0) {
                        // An empty split has nothing left to read; also avoids dividing by zero.
                        return 1.0f;
                    }
                    // Floating-point division: int / int would only ever yield 0 or 1.
                    return this.currentDoc / (float) available;
                }

                /**
                 * Copies the next document into the supplied key and value.
                 *
                 * @return {@code false} once every fetched document has been consumed
                 */
                @Override // org.apache.hadoop.mapred.RecordReader
                public boolean next(Text text, SolrRecord solrRecord) throws IOException {
                    if (this.currentDoc >= available) {
                        return false;
                    }
                    SolrDocument doc = results.get(this.currentDoc);
                    text.set((String) doc.getFieldValue("digest"));
                    solrRecord.readSolrDocument(doc);
                    this.currentDoc++;
                    return true;
                }
            };
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/indexer/solr/SolrDeleteDuplicates$SolrInputSplit.class */
    /**
     * A contiguous range of documents in the Solr index, described by the offset of
     * its first document and the number of documents it covers.
     */
    public static class SolrInputSplit implements InputSplit {
        /** Offset of the first document of the range. */
        private int docBegin;
        /** Number of documents in the range. */
        private int numDocs;

        /** No-argument constructor; the fields are filled in by {@link #readFields(DataInput)}. */
        public SolrInputSplit() {
        }

        /**
         * @param begin offset of the first document
         * @param count number of documents in the range
         */
        public SolrInputSplit(int begin, int count) {
            docBegin = begin;
            numDocs = count;
        }

        /** @return offset of the first document of the range */
        public int getDocBegin() {
            return docBegin;
        }

        /** @return number of documents in the range */
        public int getNumDocs() {
            return numDocs;
        }

        /** @return the number of documents in the range, widened to {@code long} */
        @Override // org.apache.hadoop.mapred.InputSplit
        public long getLength() throws IOException {
            return numDocs;
        }

        /** @return an empty array; no host names are reported for the split */
        @Override // org.apache.hadoop.mapred.InputSplit
        public String[] getLocations() throws IOException {
            return new String[0];
        }

        /** Reads the offset and then the count, mirroring {@link #write(DataOutput)}. */
        @Override // org.apache.hadoop.io.Writable
        public void readFields(DataInput dataInput) throws IOException {
            docBegin = dataInput.readInt();
            numDocs = dataInput.readInt();
        }

        /** Writes the offset followed by the count as two ints. */
        @Override // org.apache.hadoop.io.Writable
        public void write(DataOutput dataOutput) throws IOException {
            dataOutput.writeInt(docBegin);
            dataOutput.writeInt(numDocs);
        }
    }

    /* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/indexer/solr/SolrDeleteDuplicates$SolrRecord.class */
    /**
     * The per-document data the deduplication reducer works with: the Solr id, the
     * boost and the timestamp (milliseconds) of one indexed document.
     */
    public static class SolrRecord implements Writable {
        private float boost;
        private long tstamp;
        private String id;

        /** No-argument constructor; the fields are filled in later by one of the read methods. */
        public SolrRecord() {
        }

        /**
         * Copy constructor.
         *
         * @param solrRecord record whose id, boost and timestamp are copied
         */
        public SolrRecord(SolrRecord solrRecord) {
            this(solrRecord.id, solrRecord.boost, solrRecord.tstamp);
        }

        /**
         * @param docId     Solr document id
         * @param docBoost  document boost
         * @param timestamp document timestamp in milliseconds
         */
        public SolrRecord(String docId, float docBoost, long timestamp) {
            id = docId;
            boost = docBoost;
            tstamp = timestamp;
        }

        public String getId() {
            return id;
        }

        public float getBoost() {
            return boost;
        }

        public long getTstamp() {
            return tstamp;
        }

        /**
         * Populates this record from a Solr document. The boost field is read as a
         * {@link Float} and the timestamp field as a {@link Date}.
         *
         * @param solrDocument document to read the id, boost and timestamp fields from
         */
        public void readSolrDocument(SolrDocument solrDocument) {
            id = (String) solrDocument.getFieldValue(SolrConstants.ID_FIELD);

            Float boostValue = (Float) solrDocument.getFieldValue(SolrConstants.BOOST_FIELD);
            boost = boostValue.floatValue();

            Date indexedAt = (Date) solrDocument.getFieldValue(SolrConstants.TIMESTAMP_FIELD);
            tstamp = indexedAt.getTime();
        }

        /** Reads id, boost and timestamp in the order {@link #write(DataOutput)} emits them. */
        @Override // org.apache.hadoop.io.Writable
        public void readFields(DataInput dataInput) throws IOException {
            id = Text.readString(dataInput);
            boost = dataInput.readFloat();
            tstamp = dataInput.readLong();
        }

        /** Writes the id as a string, then the boost as a float, then the timestamp as a long. */
        @Override // org.apache.hadoop.io.Writable
        public void write(DataOutput dataOutput) throws IOException {
            Text.writeString(dataOutput, id);
            dataOutput.writeFloat(boost);
            dataOutput.writeLong(tstamp);
        }
    }

    /** @return the configuration previously supplied through {@code setConf}, or {@code null} if none was set */
    @Override // org.apache.hadoop.conf.Configurable
    public Configuration getConf() {
        return conf;
    }

    /**
     * Stores the configuration that {@code getConf} will return.
     *
     * @param configuration configuration to keep
     */
    @Override // org.apache.hadoop.conf.Configurable
    public void setConf(Configuration configuration) {
        conf = configuration;
    }

    /**
     * Prepares the reducer: creates the Solr client from the job configuration and
     * reads the {@code noCommit} flag (default {@code false}).
     *
     * @param jobConf configuration of the running job
     * @throws RuntimeException wrapping the {@link MalformedURLException} raised while creating the Solr client
     */
    @Override // org.apache.hadoop.mapred.JobConfigurable
    public void configure(JobConf jobConf) {
        try {
            solr = SolrUtils.getCommonsHttpSolrServer(jobConf);
        } catch (MalformedURLException badUrl) {
            throw new RuntimeException(badUrl);
        }
        noCommit = jobConf.getBoolean("noCommit", false);
    }

    /**
     * Sends any deletes that are still queued and, unless {@code noCommit} is set,
     * commits.
     *
     * <p>The commit is deliberately not tied to the pending count: reduce() sends
     * full batches itself and resets the counter, so a count of zero here does not
     * mean nothing was deleted. Committing only when a partial batch remained would
     * leave those earlier batches uncommitted.
     *
     * @throws IOException if Solr rejects the delete request or the commit
     */
    @Override // java.io.Closeable, java.lang.AutoCloseable
    public void close() throws IOException {
        try {
            if (this.numDeletes > 0) {
                LOG.info("SolrDeleteDuplicates: deleting " + this.numDeletes + " duplicates");
                this.updateRequest.process(this.solr);
                // Start from a clean slate so a repeated close() cannot resend the same deletes.
                this.updateRequest = new UpdateRequest();
                this.numDeletes = 0;
            }
            // solr is null when configure() never ran; there is nothing to commit in that case.
            if (!this.noCommit && this.solr != null) {
                this.solr.commit();
            }
        } catch (SolrServerException e) {
            throw new IOException(e);
        }
    }

    /**
     * Keeps one document per digest and queues every other one for deletion.
     *
     * <p>The document kept is the one with the highest boost; among equal boosts the
     * one with the latest timestamp wins, and on a full tie the earlier record in
     * iteration order is kept. Queued deletes are sent to Solr whenever
     * {@code NUM_MAX_DELETE_REQUEST} of them have accumulated; the remainder is left
     * queued for {@code close()}.
     *
     * @param text            the shared digest of the records (not used)
     * @param it              records of all documents carrying that digest
     * @param outputCollector unused; the job produces no output records
     * @param reporter        receives one "Deleted documents" counter tick per queued delete
     * @throws IOException if Solr rejects a delete request
     */
    @Override // org.apache.hadoop.mapred.Reducer
    public void reduce(Text text, Iterator<SolrRecord> it, OutputCollector<Text, SolrRecord> outputCollector, Reporter reporter) throws IOException {
        if (!it.hasNext()) {
            // No records for this key: nothing to compare, nothing to delete.
            return;
        }
        // The winner is copied rather than referenced — presumably because the framework
        // reuses the value instance between next() calls (TODO confirm).
        SolrRecord keep = new SolrRecord(it.next());
        while (it.hasNext()) {
            SolrRecord candidate = it.next();
            boolean candidateWins = candidate.getBoost() > keep.getBoost()
                    || (candidate.getBoost() == keep.getBoost() && candidate.getTstamp() > keep.getTstamp());
            if (candidateWins) {
                this.updateRequest.deleteById(keep.getId());
                keep = new SolrRecord(candidate);
            } else {
                this.updateRequest.deleteById(candidate.getId());
            }
            this.numDeletes++;
            reporter.incrCounter("SolrDedupStatus", "Deleted documents", 1L);
            if (this.numDeletes >= NUM_MAX_DELETE_REQUEST) {
                try {
                    LOG.info("SolrDeleteDuplicates: deleting {} duplicates", this.numDeletes);
                    this.updateRequest.process(this.solr);
                    this.updateRequest = new UpdateRequest();
                    this.numDeletes = 0;
                } catch (SolrServerException e) {
                    throw new IOException(e);
                }
            }
        }
    }

    /**
     * Runs the deduplication job against the given Solr server with committing enabled.
     *
     * @param str URL of the Solr server
     * @throws IOException if the job fails
     */
    public void dedup(String str) throws IOException {
        final boolean noCommit = false;
        dedup(str, noCommit);
    }

    /**
     * Configures and runs the deduplication job, logging the start time, the Solr
     * URL, the finish time and the elapsed time.
     *
     * @param str URL of the Solr server, stored in the job under {@code SolrConstants.SERVER_URL}
     * @param z   value stored in the job's {@code noCommit} property
     * @throws IOException if the job fails
     */
    public void dedup(String str, boolean z) throws IOException {
        SimpleDateFormat timeFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
        long startedAt = System.currentTimeMillis();
        LOG.info("SolrDeleteDuplicates: starting at " + timeFormat.format(Long.valueOf(startedAt)));
        LOG.info("SolrDeleteDuplicates: Solr url: " + str);

        NutchJob job = new NutchJob(getConf());

        // Job parameters read back by the input format and the reducer.
        job.set(SolrConstants.SERVER_URL, str);
        job.setBoolean("noCommit", z);

        // Input comes from Solr; the job itself writes no output.
        job.setInputFormat(SolrInputFormat.class);
        job.setOutputFormat(NullOutputFormat.class);

        // Records pass through the map phase unchanged and are deduplicated in the reducer.
        job.setMapOutputKeyClass(Text.class);
        job.setMapOutputValueClass(SolrRecord.class);
        job.setMapperClass(IdentityMapper.class);
        job.setReducerClass(SolrDeleteDuplicates.class);

        JobClient.runJob(job);

        long finishedAt = System.currentTimeMillis();
        String finishedText = timeFormat.format(Long.valueOf(finishedAt));
        String elapsedText = TimingUtil.elapsedTime(startedAt, finishedAt);
        LOG.info("SolrDeleteDuplicates: finished at " + finishedText + ", elapsed: " + elapsedText);
    }

    /**
     * Command-line entry point: {@code <solr url> [-noCommit]}.
     *
     * @param strArr arguments; the first is the Solr URL, and {@code -noCommit} is
     *               honoured only when it is the second of exactly two arguments
     * @return 1 after printing the usage message when no argument is given, otherwise 0
     * @throws IOException if the deduplication job fails
     */
    @Override // org.apache.hadoop.util.Tool
    public int run(String[] strArr) throws IOException {
        if (strArr.length >= 1) {
            boolean skipCommit = strArr.length == 2 && strArr[1].equals("-noCommit");
            dedup(strArr[0], skipCommit);
            return 0;
        }
        System.err.println("Usage: SolrDeleteDuplicates <solr url> [-noCommit]");
        return 1;
    }

    /**
     * Runs the tool through {@link ToolRunner} with a fresh Nutch configuration and
     * exits the JVM with the tool's return code.
     *
     * @param strArr command-line arguments, passed on to the tool
     * @throws Exception if the tool fails
     */
    public static void main(String[] strArr) throws Exception {
        int exitCode = ToolRunner.run(NutchConfiguration.create(), new SolrDeleteDuplicates(), strArr);
        System.exit(exitCode);
    }
}
