package org.apache.nutch.parse.html;

import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Locale;
import org.apache.fontbox.ttf.PostScriptTable;
import org.apache.hadoop.conf.Configuration;
import org.apache.nutch.parse.Outlink;
import org.apache.nutch.util.NodeWalker;
import org.apache.nutch.util.URLUtil;
import org.apache.pdfbox.pdmodel.common.PDPageLabelRange;
import org.custommonkey.xmlunit.XMLConstants;
import org.w3c.dom.NamedNodeMap;
import org.w3c.dom.Node;
import org.w3c.dom.NodeList;

/* loaded from: input_file:plugins/parse-html/parse-html.jar:org/apache/nutch/parse/html/DOMContentUtils.class */
/**
 * Helpers for pulling text, the title, the base URL and outlinks out of a
 * parsed HTML DOM tree.
 *
 * <p>The set of elements that are treated as link sources is (re)built by
 * {@link #setConf(Configuration)} and can be tuned through the
 * {@code parser.html.form.use_action} and
 * {@code parser.html.outlinks.ignore_tags} configuration properties.
 */
public class DOMContentUtils {
    /** Link-bearing elements, keyed by lower-case tag name; rebuilt by {@link #setConf}. */
    private final HashMap<String, LinkParams> linkParams = new HashMap<>();
    /** The configuration most recently passed to {@link #setConf}. */
    private Configuration conf;

    /* loaded from: input_file:plugins/parse-html/parse-html.jar:org/apache/nutch/parse/html/DOMContentUtils$LinkParams.class */
    /**
     * Describes one kind of link-bearing element: its tag name, the attribute
     * that carries the link target, and a child-count hint.
     */
    public static class LinkParams {
        /** Tag name of the element (e.g. {@code "a"}). */
        public String elName;
        /** Name of the attribute holding the link target (e.g. {@code "href"}). */
        public String attrName;
        /**
         * Child-count hint: an element that has no child nodes is discarded as
         * a link unless this value is {@code 0}.
         */
        public int childLen;

        public LinkParams(String str, String str2, int i) {
            this.elName = str;
            this.attrName = str2;
            this.childLen = i;
        }

        @Override
        public String toString() {
            return "LP[el=" + this.elName + ",attr=" + this.attrName + ",len=" + this.childLen + "]";
        }
    }

    /**
     * Creates the utility and initialises the link table from the given configuration.
     *
     * @param configuration configuration to read the link-related properties from
     */
    public DOMContentUtils(Configuration configuration) {
        setConf(configuration);
    }

    /**
     * Stores the configuration and rebuilds the table of link-bearing elements.
     *
     * <p>{@code form} is registered only when {@code parser.html.form.use_action}
     * evaluates to true (the fallback passed to {@code getBoolean} is {@code true}).
     * Tags listed in {@code parser.html.outlinks.ignore_tags} are then removed
     * again, except that {@code form} is kept when
     * {@code parser.html.form.use_action} was explicitly set.
     *
     * @param configuration configuration to read from
     */
    public void setConf(Configuration configuration) {
        // Tags that must survive the ignore list below.
        ArrayList<String> forcedTags = new ArrayList<>(1);
        this.conf = configuration;
        this.linkParams.clear();

        register("a", "href", 1);
        register("area", "href", 0);
        if (configuration.getBoolean("parser.html.form.use_action", true)) {
            register("form", "action", 1);
            if (configuration.get("parser.html.form.use_action") != null) {
                forcedTags.add("form");
            }
        }
        register("frame", "src", 0);
        register("iframe", "src", 0);
        register("script", "src", 0);
        register("link", "href", 0);
        register("img", "src", 0);

        String[] ignoredTags = configuration.getStrings("parser.html.outlinks.ignore_tags");
        if (ignoredTags != null) {
            for (String tag : ignoredTags) {
                if (!forcedTags.contains(tag)) {
                    this.linkParams.remove(tag);
                }
            }
        }
    }

    /** Adds one entry to the link table, keyed by its element name. */
    private void register(String elementName, String attributeName, int childLen) {
        this.linkParams.put(elementName, new LinkParams(elementName, attributeName, childLen));
    }

    /**
     * Appends the whitespace-normalised text found under {@code node} to
     * {@code stringBuffer}.
     *
     * @param stringBuffer buffer the text is appended to
     * @param node         root of the subtree to read
     * @param z            when {@code true}, stop as soon as a second {@code a}
     *                     node is encountered during the walk
     * @return {@code true} if the walk was stopped because of a second anchor;
     *         text collected before that point stays in the buffer
     */
    public boolean getText(StringBuffer stringBuffer, Node node, boolean z) {
        return getTextHelper(stringBuffer, node, z, 0);
    }

    /**
     * Appends the whitespace-normalised text found under {@code node} to
     * {@code stringBuffer}, never stopping early on nested anchors.
     *
     * @param stringBuffer buffer the text is appended to
     * @param node         root of the subtree to read
     */
    public void getText(StringBuffer stringBuffer, Node node) {
        getText(stringBuffer, node, false);
    }

    /**
     * Walks the subtree with a {@link NodeWalker} and collects text-node content.
     * For {@code script}, {@code style} and comment nodes the walker is asked to
     * skip their children. Runs of whitespace inside a text node are collapsed
     * to one space, and non-empty pieces are joined with a single space.
     *
     * @param out                  buffer the text is appended to
     * @param root                 node the walk starts from
     * @param abortOnNestedAnchors whether a second {@code a} node ends the walk
     * @param anchorDepth          number of {@code a} nodes already counted
     * @return {@code true} if the walk was aborted because of a second anchor
     */
    private boolean getTextHelper(StringBuffer out, Node root, boolean abortOnNestedAnchors, int anchorDepth) {
        NodeWalker walker = new NodeWalker(root);
        while (walker.hasNext()) {
            Node current = walker.nextNode();
            String name = current.getNodeName();
            short type = current.getNodeType();

            if ("script".equalsIgnoreCase(name) || "style".equalsIgnoreCase(name)) {
                walker.skipChildren();
            }
            if (abortOnNestedAnchors && "a".equalsIgnoreCase(name)) {
                anchorDepth++;
                if (anchorDepth > 1) {
                    return true;
                }
            }
            if (type == Node.COMMENT_NODE) {
                walker.skipChildren();
            }
            if (type == Node.TEXT_NODE) {
                String text = current.getNodeValue().replaceAll("\\s+", " ").trim();
                if (text.length() > 0) {
                    appendWithSpace(out, text);
                }
            }
        }
        return false;
    }

    /** Appends {@code text} to {@code out}, preceded by a space when {@code out} is not empty. */
    private static void appendWithSpace(StringBuffer out, String text) {
        if (out.length() > 0) {
            out.append(' ');
        }
        out.append(text);
    }

    /**
     * Looks for a {@code title} element and appends its text to {@code stringBuffer}.
     * The search gives up as soon as a node named {@code body} is reached.
     *
     * @param stringBuffer buffer the title text is appended to
     * @param node         root of the subtree to search
     * @return {@code true} if a {@code title} element was found before any {@code body} node
     */
    public boolean getTitle(StringBuffer stringBuffer, Node node) {
        NodeWalker walker = new NodeWalker(node);
        while (walker.hasNext()) {
            Node current = walker.nextNode();
            String name = current.getNodeName();
            if ("body".equalsIgnoreCase(name)) {
                return false;
            }
            if (current.getNodeType() == Node.ELEMENT_NODE && "title".equalsIgnoreCase(name)) {
                getText(stringBuffer, current);
                return true;
            }
        }
        return false;
    }

    /**
     * Looks for a {@code base} element with a parsable {@code href} attribute.
     * The search gives up as soon as a {@code body} element is reached.
     *
     * @param node root of the subtree to search
     * @return the first {@code base href} that parses as a URL, or {@code null} if there is none
     */
    public URL getBase(Node node) {
        NodeWalker walker = new NodeWalker(node);
        while (walker.hasNext()) {
            Node current = walker.nextNode();
            if (current.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            String name = current.getNodeName();
            if ("body".equalsIgnoreCase(name)) {
                return null;
            }
            if (!"base".equalsIgnoreCase(name)) {
                continue;
            }
            NamedNodeMap attributes = current.getAttributes();
            for (int i = 0; i < attributes.getLength(); i++) {
                Node attribute = attributes.item(i);
                if ("href".equalsIgnoreCase(attribute.getNodeName())) {
                    try {
                        return new URL(attribute.getNodeValue());
                    } catch (MalformedURLException ignored) {
                        // Best effort: an unparsable href is skipped and the search goes on.
                    }
                }
            }
        }
        return null;
    }

    /** Returns {@code true} if the node's value consists solely of whitespace characters (or is empty). */
    private boolean hasOnlyWhiteSpace(Node node) {
        String value = node.getNodeValue();
        for (int i = 0; i < value.length(); i++) {
            if (!Character.isWhitespace(value.charAt(i))) {
                return false;
            }
        }
        return true;
    }

    /** Returns {@code true} if {@code candidate} is an element with the same tag name as {@code params}. */
    private static boolean isSameElement(Node candidate, LinkParams params) {
        return candidate.getNodeType() == Node.ELEMENT_NODE
                && params.elName.equalsIgnoreCase(candidate.getNodeName());
    }

    /** Returns {@code true} if {@code candidate} is a text node holding only whitespace. */
    private boolean isBlankText(Node candidate) {
        return candidate.getNodeType() == Node.TEXT_NODE && hasOnlyWhiteSpace(candidate);
    }

    /**
     * Decides whether a link element should be discarded based on its direct children.
     *
     * <p>A link is discarded when it has no children although {@code params.childLen}
     * is non-zero, or when its only non-whitespace child is another element of the
     * same tag (optionally padded by whitespace-only text nodes, for up to three
     * children in total).
     *
     * @param node       the link element itself (not inspected)
     * @param children   direct children of the link element
     * @param childCount number of entries in {@code children}
     * @param params     link description for the element's tag
     * @return {@code true} if the link should be discarded
     */
    private boolean shouldThrowAwayLink(Node node, NodeList children, int childCount, LinkParams params) {
        switch (childCount) {
            case 0:
                // No inner structure: acceptable only for elements registered with childLen 0.
                return params.childLen != 0;
            case 1:
                // A single nested element of the same tag.
                return isSameElement(children.item(0), params);
            case 2: {
                Node first = children.item(0);
                Node second = children.item(1);
                // Nested element followed or preceded by whitespace-only text.
                return (isSameElement(first, params) && isBlankText(second))
                        || (isSameElement(second, params) && isBlankText(first));
            }
            case 3: {
                Node first = children.item(0);
                Node middle = children.item(1);
                Node last = children.item(2);
                // Nested element surrounded by whitespace-only text.
                return isSameElement(middle, params) && isBlankText(first) && isBlankText(last);
            }
            default:
                return false;
        }
    }

    /**
     * Collects an {@link Outlink} for every link-bearing element under {@code node}.
     *
     * <p>An element contributes an outlink when its tag is registered in the link
     * table, it is not rejected by the nested-link check, its target attribute is
     * present, it does not carry {@code rel="nofollow"} or {@code method="post"},
     * and the target resolves against {@code url}. The anchor text is the
     * element's text; if that is empty, {@code img alt} texts and raw text nodes
     * under the element are used instead.
     *
     * @param url       base URL that targets are resolved against
     * @param arrayList list the discovered outlinks are appended to
     * @param node      root of the subtree to scan
     */
    public void getOutlinks(URL url, ArrayList<Outlink> arrayList, Node node) {
        NodeWalker walker = new NodeWalker(node);
        while (walker.hasNext()) {
            Node current = walker.nextNode();
            if (current.getNodeType() != Node.ELEMENT_NODE) {
                continue;
            }
            // Locale.ROOT keeps the lookup stable under locales with special
            // case rules (e.g. Turkish, where "IMG" would not lower-case to "img").
            LinkParams params = this.linkParams.get(current.getNodeName().toLowerCase(Locale.ROOT));
            if (params == null) {
                continue;
            }
            NodeList children = current.getChildNodes();
            int childCount = children != null ? children.getLength() : 0;
            if (shouldThrowAwayLink(current, children, childCount, params)) {
                continue;
            }

            StringBuffer anchor = new StringBuffer();
            getText(anchor, current, true);
            if (anchor.toString().trim().length() == 0) {
                collectFallbackAnchor(anchor, current);
            }

            String target = findFollowableTarget(current.getAttributes(), params);
            if (target != null) {
                try {
                    arrayList.add(new Outlink(URLUtil.resolveURL(url, target).toString(), anchor.toString().trim()));
                } catch (MalformedURLException ignored) {
                    // Best effort: a target that cannot be resolved yields no outlink.
                }
            }
        }
    }

    /**
     * Builds substitute anchor text from the non-blank {@code alt} attributes of
     * {@code img} elements and from the non-empty text nodes under {@code linkElement}.
     */
    private void collectFallbackAnchor(StringBuffer anchor, Node linkElement) {
        NodeWalker walker = new NodeWalker(linkElement);
        while (walker.hasNext()) {
            Node inner = walker.nextNode();
            short type = inner.getNodeType();
            if (type == Node.ELEMENT_NODE) {
                if (!"img".equalsIgnoreCase(inner.getNodeName())) {
                    continue;
                }
                Node alt = inner.getAttributes().getNamedItem("alt");
                if (alt == null) {
                    continue;
                }
                String altText = alt.getTextContent();
                if (altText != null && altText.trim().length() > 0) {
                    appendWithSpace(anchor, altText);
                }
            } else if (type == Node.TEXT_NODE) {
                String text = inner.getTextContent();
                if (text != null && text.length() > 0) {
                    appendWithSpace(anchor, text);
                }
            }
        }
    }

    /**
     * Reads the link target from {@code attributes}.
     *
     * @return the value of the attribute named by {@code params.attrName}, or
     *         {@code null} if it is absent or the element is marked
     *         {@code rel="nofollow"} or {@code method="post"}
     */
    private static String findFollowableTarget(NamedNodeMap attributes, LinkParams params) {
        String target = null;
        boolean noFollow = false;
        boolean post = false;
        for (int i = 0; i < attributes.getLength(); i++) {
            Node attribute = attributes.item(i);
            String attributeName = attribute.getNodeName();
            if (params.attrName.equalsIgnoreCase(attributeName)) {
                target = attribute.getNodeValue();
            } else if ("rel".equalsIgnoreCase(attributeName)
                    && "nofollow".equalsIgnoreCase(attribute.getNodeValue())) {
                // NOTE(review): only an exact "nofollow" value matches; a multi-token
                // value such as "nofollow noopener" is still followed.
                noFollow = true;
            } else if ("method".equalsIgnoreCase(attributeName)
                    && "post".equalsIgnoreCase(attribute.getNodeValue())) {
                post = true;
            }
        }
        return (noFollow || post) ? null : target;
    }
}
