package org.apache.nutch.crawl;

import java.io.IOException;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
import java.util.Random;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapred.FileInputFormat;
import org.apache.hadoop.mapred.FileOutputFormat;
import org.apache.hadoop.mapred.JobClient;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapred.MapFileOutputFormat;
import org.apache.hadoop.mapred.Mapper;
import org.apache.hadoop.mapred.OutputCollector;
import org.apache.hadoop.mapred.Reporter;
import org.apache.hadoop.mapred.SequenceFileInputFormat;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.net.URLFilters;
import org.apache.nutch.net.URLNormalizers;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.parse.ParseData;
import org.apache.nutch.util.HadoopFSUtil;
import org.apache.nutch.util.LockUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.TimingUtil;
import org.apache.tools.ant.taskdefs.optional.clearcase.CCRmtype;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:nutch-1.5.1.jar:org/apache/nutch/crawl/LinkDb.class */
/**
 * Builds and maintains an inverted link database.
 *
 * <p>As a {@link Mapper} it reads {@code (url, ParseData)} records and, for every
 * outlink of the page, emits {@code (targetUrl, Inlinks)} holding a single
 * {@link Inlink} that points back at the source page. As a {@link Tool} it runs
 * that job over a set of segments, merges the result with an already existing
 * {@code <linkdb>/current} directory when there is one, and installs the output
 * as the new {@code <linkdb>/current}.
 */
public class LinkDb extends Configured implements Tool, Mapper<Text, ParseData, Text, Inlinks> {
    public static final Logger LOG = LoggerFactory.getLogger(LinkDb.class);

    /** Configuration key: when true, links whose target host equals the source host are dropped. */
    public static final String IGNORE_INTERNAL_LINKS = "db.ignore.internal.links";

    /** Name of the sub-directory of the linkdb that holds the live data. */
    public static final String CURRENT_NAME = "current";

    /** Name of the lock file created inside the linkdb directory while an update runs. */
    public static final String LOCK_NAME = ".locked";

    /** Anchors longer than this many characters are truncated in {@link #map}. */
    private int maxAnchorLength;

    /** Mirrors {@link #IGNORE_INTERNAL_LINKS}; read in {@link #configure}. */
    private boolean ignoreInternalLinks;

    /** Non-null only when URL filtering is enabled for the job. */
    private URLFilters urlFilters;

    /** Non-null only when URL normalization is enabled for the job. */
    private URLNormalizers urlNormalizers;

    /** Creates an unconfigured instance; a configuration must be set before use as a tool. */
    public LinkDb() {
    }

    /**
     * Creates an instance bound to the given configuration.
     *
     * @param configuration configuration handed to {@link #setConf}
     */
    public LinkDb(Configuration configuration) {
        setConf(configuration);
    }

    /**
     * Reads the mapper settings from the job configuration.
     *
     * <p>The filter and normalizer chains are only instantiated when the
     * corresponding {@link LinkDbFilter} flag is set on the job; otherwise they
     * stay {@code null} and {@link #map} skips that step.
     *
     * @param jobConf configuration of the running job
     */
    @Override
    public void configure(JobConf jobConf) {
        this.maxAnchorLength = jobConf.getInt("db.max.anchor.length", 100);
        this.ignoreInternalLinks = jobConf.getBoolean(IGNORE_INTERNAL_LINKS, true);
        if (jobConf.getBoolean(LinkDbFilter.URL_FILTERING, false)) {
            this.urlFilters = new URLFilters(jobConf);
        }
        if (jobConf.getBoolean(LinkDbFilter.URL_NORMALIZING, false)) {
            this.urlNormalizers = new URLNormalizers(jobConf, URLNormalizers.SCOPE_LINKDB);
        }
    }

    /** Nothing to release; the mapper holds no resources. */
    @Override
    public void close() {
    }

    /**
     * Inverts the outlinks of one page into inlink records.
     *
     * <p>The source URL is normalized/filtered first; if that rejects it, none of
     * the page's outlinks are emitted. Each surviving outlink produces one output
     * record keyed by the (normalized/filtered) target URL.
     *
     * @param text            URL of the source page
     * @param parseData       parse data of the source page, providing its outlinks
     * @param outputCollector receives {@code (targetUrl, Inlinks)} pairs
     * @param reporter        unused
     * @throws IOException if the collector fails
     */
    @Override
    public void map(Text text, ParseData parseData, OutputCollector<Text, Inlinks> outputCollector, Reporter reporter) throws IOException {
        String rawFromUrl = text.toString();
        // The source host is taken from the raw key, i.e. before normalization or filtering.
        String fromHost = getHost(rawFromUrl);
        String fromUrl = normalizeAndFilter(rawFromUrl);
        if (fromUrl == null) {
            return;
        }
        Outlink[] outlinks = parseData.getOutlinks();
        // One Inlinks instance is reused for every emitted record; it is cleared before each use.
        Inlinks inlinks = new Inlinks();
        for (Outlink outlink : outlinks) {
            String rawToUrl = outlink.getToUrl();
            if (this.ignoreInternalLinks) {
                String toHost = getHost(rawToUrl);
                // Unparseable targets and same-host targets are both skipped in this mode.
                if (toHost == null || toHost.equals(fromHost)) {
                    continue;
                }
            }
            String toUrl = normalizeAndFilter(rawToUrl);
            if (toUrl == null) {
                continue;
            }
            inlinks.clear();
            String anchor = outlink.getAnchor();
            if (anchor.length() > this.maxAnchorLength) {
                anchor = anchor.substring(0, this.maxAnchorLength);
            }
            inlinks.add(new Inlink(fromUrl, anchor));
            outputCollector.collect(new Text(toUrl), inlinks);
        }
    }

    /**
     * Applies the configured normalizers and then the configured filters to a URL.
     *
     * @param url URL to process
     * @return the processed URL, or {@code null} when a step rejected it or threw
     */
    private String normalizeAndFilter(String url) {
        String result = url;
        if (this.urlNormalizers != null) {
            try {
                result = this.urlNormalizers.normalize(result, URLNormalizers.SCOPE_LINKDB);
            } catch (Exception e) {
                LOG.warn("Skipping " + result + ":" + e);
                result = null;
            }
        }
        if (result != null && this.urlFilters != null) {
            try {
                result = this.urlFilters.filter(result);
            } catch (Exception e) {
                LOG.warn("Skipping " + result + ":" + e);
                result = null;
            }
        }
        return result;
    }

    /**
     * Extracts the lower-cased host of a URL.
     *
     * @param str URL string
     * @return the host in lower case, or {@code null} if {@code str} is not a valid URL
     */
    private String getHost(String str) {
        try {
            // Locale.ROOT keeps the result independent of the JVM default locale
            // (e.g. Turkish casing rules would otherwise map 'I' to a dotless 'i').
            return new URL(str).getHost().toLowerCase(Locale.ROOT);
        } catch (MalformedURLException e) {
            return null;
        }
    }

    /**
     * Updates the linkdb from every directory found directly under a segments directory.
     *
     * @param path  linkdb directory
     * @param path2 parent directory whose sub-directories are used as segments
     * @param z     whether to normalize URLs
     * @param z2    whether to filter URLs
     * @param z3    passed to {@link LockUtil#createLockFile} as its last argument
     *              (set by the {@code -force} command line option)
     * @throws IOException if listing the segments or running the update fails
     */
    public void invert(Path path, Path path2, boolean z, boolean z2, boolean z3) throws IOException {
        FileSystem fs = FileSystem.get(getConf());
        Path[] segments = HadoopFSUtil.getPaths(fs.listStatus(path2, HadoopFSUtil.getPassDirectoriesFilter(fs)));
        invert(path, segments, z, z2, z3);
    }

    /**
     * Updates the linkdb from the given segments.
     *
     * <p>Runs the inversion job, merges its output with {@code <linkdb>/current}
     * when that already exists, and installs the final output. The linkdb is
     * locked for the duration; the lock is released by {@link #install} on
     * success and by this method on any failure.
     *
     * @param path    linkdb directory
     * @param pathArr segment directories; the {@link ParseData#DIR_NAME} sub-directory of each is read
     * @param z       whether to normalize URLs
     * @param z2      whether to filter URLs
     * @param z3      passed to {@link LockUtil#createLockFile} as its last argument
     *                (set by the {@code -force} command line option)
     * @throws IOException if locking, a job, or the installation fails
     */
    public void invert(Path path, Path[] pathArr, boolean z, boolean z2, boolean z3) throws IOException {
        JobConf job = createJob(getConf(), path, z, z2);
        Path lock = new Path(path, LOCK_NAME);
        FileSystem fs = FileSystem.get(getConf());
        LockUtil.createLockFile(fs, lock, z3);

        // From here on we own the lock: it must be released on every failure path,
        // including unchecked exceptions (e.g. a null entry in pathArr).
        boolean installed = false;
        try {
            Path currentLinkDb = new Path(path, CURRENT_NAME);
            SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
            long start = System.currentTimeMillis();
            if (LOG.isInfoEnabled()) {
                LOG.info("LinkDb: starting at " + sdf.format(Long.valueOf(start)));
                LOG.info("LinkDb: linkdb: " + path);
                LOG.info("LinkDb: URL normalize: " + z);
                LOG.info("LinkDb: URL filter: " + z2);
                if (job.getBoolean(IGNORE_INTERNAL_LINKS, true)) {
                    LOG.info("LinkDb: internal links will be ignored.");
                }
            }
            for (Path segment : pathArr) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("LinkDb: adding segment: " + segment);
                }
                FileInputFormat.addInputPath(job, new Path(segment, ParseData.DIR_NAME));
            }

            JobClient.runJob(job);

            if (fs.exists(currentLinkDb)) {
                if (LOG.isInfoEnabled()) {
                    LOG.info("LinkDb: merging with existing linkdb: " + path);
                }
                // The inversion output becomes one input of the merge job and is
                // discarded afterwards, whether or not the merge succeeded.
                Path freshLinks = FileOutputFormat.getOutputPath(job);
                job = LinkDbMerger.createMergeJob(getConf(), path, z, z2);
                FileInputFormat.addInputPath(job, currentLinkDb);
                FileInputFormat.addInputPath(job, freshLinks);
                try {
                    JobClient.runJob(job);
                } finally {
                    fs.delete(freshLinks, true);
                }
            }

            install(job, path); // also removes the lock file
            installed = true;

            long end = System.currentTimeMillis();
            LOG.info("LinkDb: finished at " + sdf.format(Long.valueOf(end)) + ", elapsed: " + TimingUtil.elapsedTime(start, end));
        } finally {
            if (!installed) {
                LockUtil.removeLockFile(fs, lock);
            }
        }
    }

    /**
     * Creates the inversion job, writing to a randomly named {@code linkdb-N} path.
     *
     * <p>Normalization/filtering flags are only set on this job when the linkdb
     * directory does not exist yet.
     *
     * @param configuration base configuration
     * @param path          linkdb directory
     * @param z             whether to normalize URLs
     * @param z2            whether to filter URLs
     * @return the configured job, without input paths
     */
    private static JobConf createJob(Configuration configuration, Path path, boolean z, boolean z2) {
        Path tempOutput = new Path("linkdb-" + Integer.toString(new Random().nextInt(Integer.MAX_VALUE)));

        JobConf job = new NutchJob(configuration);
        job.setJobName("linkdb " + path);

        job.setInputFormat(SequenceFileInputFormat.class);
        job.setMapperClass(LinkDb.class);
        job.setCombinerClass(LinkDbMerger.class);
        if (z || z2) {
            try {
                if (!FileSystem.get(configuration).exists(path)) {
                    job.setBoolean(LinkDbFilter.URL_FILTERING, z2);
                    job.setBoolean(LinkDbFilter.URL_NORMALIZING, z);
                }
            } catch (Exception e) {
                // Best effort: on failure the flags are simply left unset for this job.
                LOG.warn("LinkDb createJob: " + e);
            }
        }
        job.setReducerClass(LinkDbMerger.class);

        FileOutputFormat.setOutputPath(job, tempOutput);
        job.setOutputFormat(MapFileOutputFormat.class);
        job.setBoolean("mapred.output.compress", true);
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(Inlinks.class);
        return job;
    }

    /**
     * Installs a job's output as {@code <linkdb>/current} and removes the lock file.
     *
     * <p>An existing {@code current} is first moved to {@code old}, which is
     * deleted once the new data is in place. If a rename reports failure the
     * method throws instead of continuing, so that {@code old} is never deleted
     * without a new {@code current} having been installed.
     *
     * @param jobConf job whose output path holds the new linkdb data
     * @param path    linkdb directory
     * @throws IOException if a file system operation fails or a rename is refused
     */
    public static void install(JobConf jobConf, Path path) throws IOException {
        Path newLinkDb = FileOutputFormat.getOutputPath(jobConf);
        FileSystem fs = new JobClient(jobConf).getFs();
        Path old = new Path(path, "old");
        Path current = new Path(path, CURRENT_NAME);

        boolean hadCurrent = fs.exists(current);
        if (hadCurrent) {
            if (fs.exists(old)) {
                fs.delete(old, true);
            }
            if (!fs.rename(current, old)) {
                throw new IOException("LinkDb: could not move " + current + " to " + old);
            }
        }
        fs.mkdirs(path);
        if (!fs.rename(newLinkDb, current)) {
            if (hadCurrent) {
                // Best-effort rollback so the previous linkdb stays available.
                fs.rename(old, current);
            }
            throw new IOException("LinkDb: could not move " + newLinkDb + " to " + current);
        }
        if (fs.exists(old)) {
            fs.delete(old, true);
        }
        LockUtil.removeLockFile(fs, new Path(path, LOCK_NAME));
    }

    /**
     * Command line entry point; exits with the status returned by {@link #run}.
     *
     * @param strArr command line arguments
     * @throws Exception if the tool runner fails
     */
    public static void main(String[] strArr) throws Exception {
        System.exit(ToolRunner.run(NutchConfiguration.create(), new LinkDb(), strArr));
    }

    /**
     * Parses the command line and runs the linkdb update.
     *
     * @param strArr {@code <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]}
     * @return 0 on success, -1 on a usage error or when the update fails
     * @throws Exception if the file system cannot be obtained or a segments directory cannot be listed
     */
    @Override
    public int run(String[] strArr) throws Exception {
        if (strArr.length < 2) {
            printUsage();
            return -1;
        }
        FileSystem fs = FileSystem.get(getConf());
        Path linkDb = new Path(strArr[0]);
        List<Path> segments = new ArrayList<Path>();
        boolean normalize = true;
        boolean filter = true;
        boolean force = false;
        for (int i = 1; i < strArr.length; i++) {
            String arg = strArr[i];
            if (arg.equals("-dir")) {
                i++;
                if (i >= strArr.length) {
                    // "-dir" was the last argument: report it instead of failing on the array access.
                    System.err.println("LinkDb: -dir requires a <segmentsDir> argument");
                    printUsage();
                    return -1;
                }
                Path segmentsDir = new Path(strArr[i]);
                segments.addAll(Arrays.asList(HadoopFSUtil.getPaths(fs.listStatus(segmentsDir, HadoopFSUtil.getPassDirectoriesFilter(fs)))));
            } else if (arg.equalsIgnoreCase("-noNormalize")) {
                normalize = false;
            } else if (arg.equalsIgnoreCase("-noFilter")) {
                filter = false;
            } else if (arg.equalsIgnoreCase("-force")) {
                force = true;
            } else {
                segments.add(new Path(arg));
            }
        }
        try {
            invert(linkDb, segments.toArray(new Path[segments.size()]), normalize, filter, force);
            return 0;
        } catch (Exception e) {
            LOG.error("LinkDb: " + StringUtils.stringifyException(e));
            return -1;
        }
    }

    /** Prints the command line synopsis to standard error. */
    private static void printUsage() {
        System.err.println("Usage: LinkDb <linkdb> (-dir <segmentsDir> | <seg1> <seg2> ...) [-force] [-noNormalize] [-noFilter]");
        System.err.println("\tlinkdb\toutput LinkDb to create or update");
        System.err.println("\t-dir segmentsDir\tparent directory of several segments, OR");
        System.err.println("\tseg1 seg2 ...\t list of segment directories");
        System.err.println("\t-force\tforce update even if LinkDb appears to be locked (CAUTION advised)");
        System.err.println("\t-noNormalize\tdon't normalize link URLs");
        System.err.println("\t-noFilter\tdon't apply URLFilters to link URLs");
    }
}
