package org.apache.nutch.protocol.http.api;

import java.io.ByteArrayOutputStream;
import java.io.Closeable;
import java.io.FileInputStream;
import java.io.FileReader;
import java.io.IOException;
import java.io.LineNumberReader;
import java.net.URL;
import java.net.URLDecoder;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Hashtable;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configurable;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.util.StringUtils;
import org.apache.nutch.crawl.CrawlDatum;
import org.apache.nutch.net.protocols.Response;
import org.apache.nutch.protocol.ProtocolException;
import org.apache.nutch.protocol.RobotRules;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:plugins/lib-http/lib-http.jar:org/apache/nutch/protocol/http/api/RobotRulesParser.class */
public class RobotRulesParser implements Configurable {
    /**
     * When {@code true}, an HTTP 403 response for robots.txt yields the empty
     * (allow-all) rule set instead of the forbid-all rule set. Populated from
     * the {@code http.robots.403.allow} property in {@code setConf}.
     */
    private boolean allowForbidden = false;

    /** Name of the character encoding used for URL-decoding paths. */
    private static final String CHARACTER_ENCODING = "UTF-8";

    /**
     * Sentinel precedence meaning "no configured robot name matched yet".
     * Real precedences are indices into the configured name list, so they are
     * always smaller than this value.
     */
    private static final int NO_PRECEDENCE = Integer.MAX_VALUE;

    /** Configuration handed to {@code setConf}; {@code null} until then. */
    private Configuration conf;

    /**
     * Maps a lower-cased robot name to its precedence (a smaller value wins).
     * Always contains an entry for {@code "*"} once {@code setRobotNames} ran.
     */
    private HashMap<String, Integer> robotNames;

    /** Size in bytes of the read buffer used by the command-line tool. */
    private static final int BUFSIZE = 2048;

    public static final Logger LOG = LoggerFactory.getLogger(RobotRulesParser.class);

    /**
     * Rule sets already fetched, keyed by lower-cased host name. The map is
     * static, so it is shared by every parser instance in the JVM.
     */
    private static final Hashtable<String, RobotRuleSet> CACHE = new Hashtable<String, RobotRuleSet>();

    /** Shared rule set without any entries: every path is allowed. */
    private static final RobotRuleSet EMPTY_RULES = new RobotRuleSet();

    /** Shared rule set whose single empty-prefix entry disallows every path. */
    private static final RobotRuleSet FORBID_ALL_RULES = getForbidAllRules();

    /* loaded from: input_file:plugins/lib-http/lib-http.jar:org/apache/nutch/protocol/http/api/RobotRulesParser$RobotRuleSet.class */
    /**
     * An ordered list of path-prefix rules plus an optional crawl delay.
     *
     * <p>Rules are evaluated in insertion order and the first rule whose prefix
     * matches the (URL-decoded) path decides the outcome; a path matched by no
     * rule is allowed.
     *
     * <p>While rules are being added they live in {@code tmpEntries}; the first
     * query copies them into the {@code entries} array and drops the list.
     * Adding a rule afterwards moves the rules back into a list. The class does
     * no synchronization of its own.
     */
    public static class RobotRuleSet implements RobotRules {
        /** Value last passed to {@link #setExpireTime(long)}; only stored here. */
        long expireTime;
        /** Rules collected so far; {@code null} while {@code entries} is in use. */
        ArrayList<RobotsEntry> tmpEntries = new ArrayList<RobotsEntry>();
        /** Array form of the rules used for matching; {@code null} while building. */
        RobotsEntry[] entries = null;
        /** Crawl delay as last set via {@link #setCrawlDelay(long)}; -1 if never set. */
        long crawlDelay = -1;

        /* JADX INFO: Access modifiers changed from: private */
        /* loaded from: input_file:plugins/lib-http/lib-http.jar:org/apache/nutch/protocol/http/api/RobotRulesParser$RobotRuleSet$RobotsEntry.class */
        /** A single rule: a path prefix and whether paths starting with it are allowed. */
        public class RobotsEntry {
            String prefix;
            boolean allowed;

            RobotsEntry(String str, boolean z) {
                this.prefix = str;
                this.allowed = z;
            }
        }

        /**
         * Appends a rule to the end of the rule list.
         *
         * @param str path prefix the rule applies to
         * @param z   {@code true} for an allow rule, {@code false} for a disallow rule
         */
        /* JADX INFO: Access modifiers changed from: private */
        public void addPrefix(String str, boolean z) {
            if (this.tmpEntries == null) {
                // The rules were already turned into an array by a query:
                // switch back to list form, keeping the existing order.
                ArrayList<RobotsEntry> rebuilt = new ArrayList<RobotsEntry>();
                if (this.entries != null) {
                    for (RobotsEntry existing : this.entries) {
                        rebuilt.add(existing);
                    }
                }
                this.tmpEntries = rebuilt;
                this.entries = null;
            }
            this.tmpEntries.add(new RobotsEntry(str, z));
        }

        /** Removes every rule, leaving the set in list (building) form. */
        /* JADX INFO: Access modifiers changed from: private */
        public void clearPrefixes() {
            if (this.tmpEntries != null) {
                this.tmpEntries.clear();
            } else {
                this.tmpEntries = new ArrayList<RobotsEntry>();
                this.entries = null;
            }
        }

        /**
         * Stores the expiry time.
         *
         * @param j value later returned by {@link #getExpireTime()}
         */
        public void setExpireTime(long j) {
            this.expireTime = j;
        }

        /** @return the value last passed to {@link #setExpireTime(long)} */
        @Override // org.apache.nutch.protocol.RobotRules
        public long getExpireTime() {
            return this.expireTime;
        }

        /** @return the crawl delay last set, or -1 if none was set */
        @Override // org.apache.nutch.protocol.RobotRules
        public long getCrawlDelay() {
            return this.crawlDelay;
        }

        /**
         * Stores the crawl delay.
         *
         * @param j value later returned by {@link #getCrawlDelay()}
         */
        public void setCrawlDelay(long j) {
            this.crawlDelay = j;
        }

        /**
         * Tests the path component of a URL against the rules.
         *
         * @param url URL to test; a missing or empty path is treated as "/"
         * @return {@code true} if the path is allowed
         */
        @Override // org.apache.nutch.protocol.RobotRules
        public boolean isAllowed(URL url) {
            String path = url.getPath();
            if (path == null || "".equals(path)) {
                path = "/";
            }
            return isAllowed(path);
        }

        /**
         * Tests a path against the rules. The path is URL-decoded as UTF-8
         * first; the first rule whose prefix matches decides the result.
         *
         * @param str path to test
         * @return the {@code allowed} flag of the first matching rule, or
         *         {@code true} when no rule matches
         */
        public boolean isAllowed(String str) {
            String path = str;
            try {
                path = URLDecoder.decode(str, "UTF-8");
            } catch (Exception ignored) {
                // Deliberately best-effort: if the path cannot be decoded the
                // raw path is still matched against the prefixes.
            }
            for (RobotsEntry rule : materializeEntries()) {
                if (path.startsWith(rule.prefix)) {
                    return rule.allowed;
                }
            }
            return true;
        }

        /**
         * Renders the rules, one per line, as {@code Allow: <prefix>} or
         * {@code Disallow: <prefix>}, each followed by the platform line
         * separator.
         */
        @Override
        public String toString() {
            String lineSeparator = System.getProperty("line.separator");
            StringBuilder text = new StringBuilder();
            for (RobotsEntry rule : materializeEntries()) {
                text.append(rule.allowed ? "Allow: " : "Disallow: ")
                    .append(rule.prefix)
                    .append(lineSeparator);
            }
            return text.toString();
        }

        /**
         * Returns the rules in array form, converting from the list form on
         * first use. The array is filled completely before it is stored in
         * {@code entries}, so the field never refers to a partially built array.
         */
        private RobotsEntry[] materializeEntries() {
            RobotsEntry[] current = this.entries;
            if (current == null) {
                ArrayList<RobotsEntry> pending = this.tmpEntries;
                current = pending.toArray(new RobotsEntry[pending.size()]);
                this.entries = current;
                this.tmpEntries = null;
            }
            return current;
        }
    }

    /* JADX INFO: Access modifiers changed from: package-private */
    /**
     * Creates a parser without any configuration. Neither the configuration
     * nor the robot-name table is initialised here; both stay {@code null}
     * until {@code setConf} is called.
     */
    public RobotRulesParser() {
    }

    /**
     * Creates a parser and immediately applies the given configuration by
     * delegating to {@code setConf}.
     *
     * @param configuration configuration passed on to {@code setConf}
     */
    public RobotRulesParser(Configuration configuration) {
        setConf(configuration);
    }

    /**
     * Stores the configuration and derives the parser settings from it:
     * <ul>
     *   <li>{@code http.robots.403.allow} (default {@code false}) controls how
     *       a 403 response for robots.txt is interpreted;</li>
     *   <li>{@code http.robots.agents} is a comma-separated list of robot
     *       names; each name is trimmed and the list order becomes the
     *       precedence order handed to {@code setRobotNames}.</li>
     * </ul>
     * If {@code http.robots.agents} is not set, the name list is empty and
     * only the {@code "*"} entry added by {@code setRobotNames} applies.
     *
     * @param configuration configuration to read from; must not be {@code null}
     */
    @Override // org.apache.hadoop.conf.Configurable
    public void setConf(Configuration configuration) {
        this.conf = configuration;
        this.allowForbidden = configuration.getBoolean("http.robots.403.allow", false);

        ArrayList<String> agents = new ArrayList<String>();
        String agentNames = configuration.get("http.robots.agents");
        // StringTokenizer throws NullPointerException for a null string, so an
        // unset property is handled like an empty list.
        if (agentNames != null) {
            StringTokenizer tokens = new StringTokenizer(agentNames, StringUtils.COMMA_STR);
            while (tokens.hasMoreTokens()) {
                agents.add(tokens.nextToken().trim());
            }
        }
        setRobotNames(agents.toArray(new String[agents.size()]));
    }

    /**
     * Returns the configuration last passed to {@code setConf}.
     *
     * @return the stored configuration, or {@code null} if none was set
     */
    @Override // org.apache.hadoop.conf.Configurable
    public Configuration getConf() {
        return this.conf;
    }

    /**
     * Rebuilds the robot-name table. Each name is lower-cased and mapped to
     * its index in {@code strArr}; the index serves as the precedence, so
     * names earlier in the array win. If the array does not contain
     * {@code "*"}, the wildcard is added with the lowest precedence
     * ({@code strArr.length}).
     *
     * <p>A name that occurs more than once keeps the index of its last
     * occurrence, because later {@code put} calls overwrite earlier ones.
     *
     * @param strArr robot names in descending order of precedence
     */
    private void setRobotNames(String[] strArr) {
        HashMap<String, Integer> names = new HashMap<String, Integer>();
        for (int i = 0; i < strArr.length; i++) {
            names.put(strArr[i].toLowerCase(), Integer.valueOf(i));
        }
        if (!names.containsKey("*")) {
            names.put("*", Integer.valueOf(strArr.length));
        }
        this.robotNames = names;
    }

    /**
     * Creates a parser for the given robot names without a configuration;
     * only the robot-name table is initialised.
     *
     * @param strArr robot names passed on to {@code setRobotNames}
     */
    RobotRulesParser(String[] strArr) {
        setRobotNames(strArr);
    }

    /**
     * Parses the content of a robots.txt file and returns the rules of the
     * record that matches the configured robot name with the best (smallest)
     * precedence.
     *
     * <p>The content is decoded as UTF-8 and split into lines on CR/LF.
     * Everything from a {@code #} to the end of a line is dropped and the
     * rest is trimmed. The directives {@code User-agent:}, {@code Disallow:},
     * {@code Allow:} and {@code Crawl-Delay:} are recognised case-insensitively;
     * other lines are ignored.
     * <ul>
     *   <li>Consecutive {@code User-agent:} lines form the header of a record;
     *       each may list several whitespace-separated names.</li>
     *   <li>Rules of a record are collected only if one of its names has a
     *       better precedence than any record accepted so far.</li>
     *   <li>{@code Disallow:} and {@code Allow:} paths are URL-decoded as
     *       UTF-8, matching the decoding applied to tested paths. An empty
     *       path clears the rules collected for the current record.</li>
     *   <li>{@code Crawl-Delay:} is read as whole seconds and stored in
     *       milliseconds; an unparsable value is logged and stored as -1.</li>
     * </ul>
     *
     * @param bArr raw robots.txt content, may be {@code null}
     * @return the rules of the best-matching record, or the shared empty rule
     *         set when {@code bArr} is {@code null} or no record matches
     */
    RobotRuleSet parseRules(byte[] bArr) {
        if (bArr == null) {
            return EMPTY_RULES;
        }
        // Decode explicitly as UTF-8 so the result does not depend on the
        // platform default charset.
        String content = new String(bArr, Charset.forName("UTF-8"));
        StringTokenizer lines = new StringTokenizer(content, "\n\r");

        RobotRuleSet bestRules = null;           // rules of the best record accepted so far
        int bestPrecedence = NO_PRECEDENCE;
        RobotRuleSet currentRules = new RobotRuleSet();
        int currentPrecedence = NO_PRECEDENCE;   // best precedence among the current record's agents
        boolean addRules = false;                // current record beats the best one: collect its rules
        boolean doneAgents = false;              // a rule line was seen since the last User-agent line

        while (lines.hasMoreTokens()) {
            String line = lines.nextToken();
            int commentStart = line.indexOf("#");
            if (commentStart >= 0) {
                line = line.substring(0, commentStart);
            }
            line = line.trim();

            if (startsWithDirective(line, "User-agent:")) {
                if (doneAgents) {
                    // A new record starts: keep the finished one if it is better.
                    if (currentPrecedence < bestPrecedence) {
                        bestPrecedence = currentPrecedence;
                        bestRules = currentRules;
                        currentPrecedence = NO_PRECEDENCE;
                        currentRules = new RobotRuleSet();
                    }
                    addRules = false;
                }
                doneAgents = false;

                StringTokenizer agents = new StringTokenizer(directiveValue(line, "User-agent:"));
                while (agents.hasMoreTokens()) {
                    Integer precedence = (Integer) this.robotNames.get(agents.nextToken().toLowerCase());
                    if (precedence != null) {
                        int value = precedence.intValue();
                        if (value < currentPrecedence && value < bestPrecedence) {
                            currentPrecedence = value;
                        }
                    }
                }
                if (currentPrecedence < bestPrecedence) {
                    addRules = true;
                }
            } else if (startsWithDirective(line, "Disallow:")) {
                doneAgents = true;
                String path = decodeRulePath(directiveValue(line, "Disallow:"));
                if (addRules) {
                    if (path.length() == 0) {
                        currentRules.clearPrefixes();
                    } else {
                        currentRules.addPrefix(path, false);
                    }
                }
            } else if (startsWithDirective(line, "Allow:")) {
                doneAgents = true;
                // Decoded like Disallow paths; tested paths are decoded too, so
                // an undecoded percent-escaped Allow prefix could never match.
                String path = decodeRulePath(directiveValue(line, "Allow:"));
                if (addRules) {
                    if (path.length() == 0) {
                        currentRules.clearPrefixes();
                    } else {
                        currentRules.addPrefix(path, true);
                    }
                }
            } else if (startsWithDirective(line, "Crawl-Delay:")) {
                doneAgents = true;
                if (addRules) {
                    String delayText = directiveValue(line, "Crawl-Delay:");
                    if (delayText.length() > 0) {
                        long delayMillis = -1;
                        try {
                            delayMillis = Long.parseLong(delayText) * 1000;
                        } catch (Exception e) {
                            LOG.info("can not parse Crawl-Delay:" + e.toString());
                        }
                        currentRules.setCrawlDelay(delayMillis);
                    }
                }
            }
        }

        // The last record is never followed by a User-agent line, so it is
        // compared against the best record here.
        if (currentPrecedence < bestPrecedence) {
            bestPrecedence = currentPrecedence;
            bestRules = currentRules;
        }
        return bestPrecedence == NO_PRECEDENCE ? EMPTY_RULES : bestRules;
    }

    /** Tells whether {@code line} begins with {@code directive}, ignoring case. */
    private static boolean startsWithDirective(String line, String directive) {
        return line.regionMatches(true, 0, directive, 0, directive.length());
    }

    /** Returns the trimmed text following {@code directive} at the start of {@code line}. */
    private static String directiveValue(String line, String directive) {
        return line.substring(directive.length()).trim();
    }

    /** URL-decodes a rule path as UTF-8; on failure logs a warning and returns the path unchanged. */
    private static String decodeRulePath(String path) {
        try {
            return URLDecoder.decode(path, "UTF-8");
        } catch (Exception e) {
            if (LOG.isWarnEnabled()) {
                LOG.warn("error parsing robots rules- can't decode path: " + path);
            }
            return path;
        }
    }

    /**
     * Returns the shared rule set that contains no entries.
     *
     * @return the {@code EMPTY_RULES} instance (the same object on every call)
     */
    static RobotRuleSet getEmptyRules() {
        return EMPTY_RULES;
    }

    /**
     * Builds a new rule set that disallows every path.
     *
     * @return a fresh rule set holding a single disallow rule for the empty prefix
     */
    static RobotRuleSet getForbidAllRules() {
        RobotRuleSet forbidAll = new RobotRuleSet();
        // Every string starts with the empty string, so this one rule
        // matches (and disallows) any path.
        forbidAll.addPrefix("", false);
        return forbidAll;
    }

    /**
     * Returns the robot rules for the host of the given URL text.
     *
     * @param httpBase protocol implementation used to fetch robots.txt
     * @param text     textual form of the URL whose host is looked up
     * @return the rules for the host, or the shared empty (allow-all) rule set
     *         if the text cannot be turned into a URL or the lookup throws
     */
    public RobotRuleSet getRobotRulesSet(HttpBase httpBase, Text text) {
        try {
            return getRobotRulesSet(httpBase, new URL(text.toString()));
        } catch (Exception e) {
            // Falling back to "everything allowed" is kept, but the reason is
            // logged so the fallback no longer happens silently.
            if (LOG.isInfoEnabled()) {
                LOG.info("Couldn't get robot rules for " + text + ": " + e.toString());
            }
            return EMPTY_RULES;
        }
    }

    /**
     * Returns the robot rules for the host of {@code url}, fetching and
     * parsing {@code /robots.txt} on a cache miss.
     *
     * <p>The response is interpreted as follows:
     * <ul>
     *   <li>301/302 with a {@code Location} header: the redirect is followed
     *       once and the second response is interpreted instead;</li>
     *   <li>200: the content is parsed;</li>
     *   <li>403 while {@code allowForbidden} is {@code false}: everything is
     *       forbidden;</li>
     *   <li>500 and above, or any failure while fetching: the empty rule set
     *       is returned but not cached, so the fetch is retried later;</li>
     *   <li>anything else: the empty rule set.</li>
     * </ul>
     * Cacheable results are stored under the lower-cased host and, if a
     * redirect led to a different host, under that lower-cased host as well.
     *
     * @param httpBase protocol implementation used to fetch robots.txt
     * @param url      any URL on the host of interest
     * @return the rule set for the host, never {@code null}
     */
    private RobotRuleSet getRobotRulesSet(HttpBase httpBase, URL url) {
        String host = url.getHost().toLowerCase();
        RobotRuleSet cached = (RobotRuleSet) CACHE.get(host);
        if (cached != null) {
            return cached;
        }
        if (LOG.isTraceEnabled()) {
            LOG.trace("cache miss " + url);
        }

        RobotRuleSet rules;
        boolean cacheRule = true;
        URL redirect = null;
        try {
            URL robotsUrl = new URL(url, "/robots.txt");
            Response response = httpBase.getResponse(robotsUrl, new CrawlDatum(), true);
            int code = response.getCode();
            if (code == 301 || code == 302) {
                String location = response.getHeader("Location");
                if (location == null) {
                    location = response.getHeader("location");
                }
                if (location != null) {
                    // Resolve against the URL that was actually requested.
                    // URL(context, spec) accepts absolute and relative values,
                    // so no guessing based on an "http" prefix is needed.
                    redirect = new URL(robotsUrl, location);
                    response = httpBase.getResponse(redirect, new CrawlDatum(), true);
                    code = response.getCode();
                }
            }
            if (code == 200) {
                rules = parseRules(response.getContent());
            } else if (code == 403 && !this.allowForbidden) {
                rules = FORBID_ALL_RULES;
            } else if (code >= 500) {
                // Server-side failure: use empty rules now, but retry later.
                cacheRule = false;
                rules = EMPTY_RULES;
            } else {
                rules = EMPTY_RULES;
            }
        } catch (Throwable th) {
            if (LOG.isInfoEnabled()) {
                LOG.info("Couldn't get robots.txt for " + url + ": " + th.toString());
            }
            cacheRule = false;
            rules = EMPTY_RULES;
        }

        if (cacheRule) {
            CACHE.put(host, rules);
            if (redirect != null) {
                // Lookups use the lower-cased host, so the redirect target
                // must be normalised the same way to ever be found again.
                String redirectHost = redirect.getHost().toLowerCase();
                if (!redirectHost.equals(host)) {
                    CACHE.put(redirectHost, rules);
                }
            }
        }
        return rules;
    }

    /**
     * Tells whether the path of {@code url} may be fetched according to the
     * robot rules of its host. A missing or empty path is tested as "/".
     *
     * @param httpBase protocol implementation used to fetch robots.txt if needed
     * @param url      URL to test
     * @return {@code true} if the rules allow the path
     */
    public boolean isAllowed(HttpBase httpBase, URL url) throws ProtocolException, IOException {
        RobotRuleSet rules = getRobotRulesSet(httpBase, url);
        // RobotRuleSet.isAllowed(URL) applies the same "empty path means /"
        // normalisation before matching the path.
        return rules.isAllowed(url);
    }

    /**
     * Returns the crawl delay stored in the robot rules of the URL's host.
     *
     * @param httpBase protocol implementation used to fetch robots.txt if needed
     * @param url      any URL on the host of interest
     * @return the crawl delay of the host's rule set (-1 if none was set)
     */
    public long getCrawlDelay(HttpBase httpBase, URL url) throws ProtocolException, IOException {
        RobotRuleSet rules = getRobotRulesSet(httpBase, url);
        return rules.getCrawlDelay();
    }

    /**
     * Command-line test driver. Parses a robots.txt file for the given agent
     * names, prints the selected rules, then reads paths from a second file
     * (one per line, surrounding whitespace removed) and prints for each
     * whether it is allowed.
     *
     * @param strArr {@code <robots-file> <url-file> <agent-name>+}
     */
    public static void main(String[] strArr) {
        if (strArr.length < 3) {
            System.out.println("Usage:");
            System.out.println("   java <robots-file> <url-file> <agent-name>+");
            System.out.println("");
            System.out.println("The <robots-file> will be parsed as a robots.txt file,");
            System.out.println("using the given <agent-name> to select rules.  URLs ");
            System.out.println("will be read (one per line) from <url-file>, and tested");
            System.out.println("against the rules.");
            System.exit(-1);
        }

        FileInputStream robotsIn = null;
        LineNumberReader testsIn = null;
        try {
            robotsIn = new FileInputStream(strArr[0]);
            testsIn = new LineNumberReader(new FileReader(strArr[1]));

            // Everything after the two file names is an agent name.
            String[] agentNames = new String[strArr.length - 2];
            System.arraycopy(strArr, 2, agentNames, 0, agentNames.length);

            // Read the whole robots file into memory.
            ByteArrayOutputStream robotsBytes = new ByteArrayOutputStream();
            byte[] buffer = new byte[BUFSIZE];
            int read;
            while ((read = robotsIn.read(buffer)) >= 0) {
                robotsBytes.write(buffer, 0, read);
            }

            RobotRuleSet rules = new RobotRulesParser(agentNames).parseRules(robotsBytes.toByteArray());
            System.out.println("Rules:");
            System.out.println(rules);
            System.out.println();

            // readLine() returns null at end of input, so it is checked before
            // trimming; an empty file simply produces no output lines.
            String testPath;
            while ((testPath = testsIn.readLine()) != null) {
                testPath = testPath.trim();
                System.out.println((rules.isAllowed(testPath) ? "allowed" : "not allowed") + ":\t" + testPath);
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            closeQuietly(robotsIn);
            closeQuietly(testsIn);
        }
    }

    /** Closes {@code resource} if it is not {@code null}, ignoring failures on close. */
    private static void closeQuietly(Closeable resource) {
        if (resource == null) {
            return;
        }
        try {
            resource.close();
        } catch (IOException ignored) {
            // Nothing useful can be done if closing fails in this tool.
        }
    }
}
