package io.github.sparqlanything.html;

import com.microsoft.playwright.Browser;
import com.microsoft.playwright.Frame;
import com.microsoft.playwright.Page;
import com.microsoft.playwright.Playwright;
import io.github.sparqlanything.model.FacadeXGraphBuilder;
import io.github.sparqlanything.model.IRIArgument;
import io.github.sparqlanything.model.PropertyUtils;
import io.github.sparqlanything.model.Triplifier;
import io.github.sparqlanything.model.TriplifierHTTPException;
import java.io.IOException;
import java.io.InputStream;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Paths;
import java.util.HashMap;
import java.util.Iterator;
import java.util.Map;
import java.util.Objects;
import java.util.Properties;
import java.util.Set;
import java.util.concurrent.TimeUnit;
import org.apache.any23.Any23;
import org.apache.any23.extractor.ExtractionException;
import org.apache.any23.source.DocumentSource;
import org.apache.any23.writer.TripleHandlerException;
import org.apache.jena.ext.com.google.common.collect.Sets;
import org.jsoup.Jsoup;
import org.jsoup.internal.StringUtil;
import org.jsoup.nodes.Attribute;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.nodes.Node;
import org.jsoup.select.Elements;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/* loaded from: input_file:io/github/sparqlanything/html/HTMLTriplifier.class */
/**
 * {@link Triplifier} for HTML documents.
 *
 * <p>The document is parsed with jsoup, either from the input stream resolved from the
 * properties or, when {@code html.browser} is set, from the markup rendered by a headless
 * Playwright browser. Every element matched by the {@code html.selector} CSS selector
 * (default {@code :root}) is then walked recursively and reported to the
 * {@link FacadeXGraphBuilder}: tag name as type, attributes as values, inner HTML / inner
 * text as values, and child nodes as numbered slots.
 */
public class HTMLTriplifier implements Triplifier {
    /** Property that, when set to {@code true}, also runs Any23 metadata extraction on the location. */
    public static final String PROPERTY_METADATA = "html.metadata";
    private static final Logger log = LoggerFactory.getLogger(HTMLTriplifier.class);
    private static final String PROPERTY_SELECTOR = "html.selector";
    private static final String PROPERTY_BROWSER = "html.browser";
    private static final String PROPERTY_BROWSER_WAIT = "html.browser.wait";
    private static final String PROPERTY_BROWSER_SCREENSHOT = "html.browser.screenshot";
    private static final String PROPERTY_BROWSER_TIMEOUT = "html.browser.timeout";
    private static final String HTML_NS = "http://www.w3.org/1999/xhtml#";
    private static final String DOM_NS = "https://html.spec.whatwg.org/#";
    /** Prefix of the properties whose remainder is forwarded as an HTTP header name to the browser. */
    private static final String HTTP_HEADER_PREFIX = "http.header.";
    /** First argument handed to every builder call; presumably the data source id (confirm against FacadeXGraphBuilder). */
    private static final String DATA_SOURCE_ID = "";
    /** Identifier used for the root container. */
    private static final String ROOT_ID = "";

    /**
     * Builds a path-like name for an element from its tag name, its class names and,
     * where the same selector matches more than one child of the parent, its 1-based
     * sibling position. Ancestors are prepended recursively, separated by {@code /}.
     *
     * @param element the element to name
     * @return the path-like name, without a leading slash
     */
    private static String localName(Element element) {
        // ':' in namespaced tag names is written as '|', the CSS namespace separator.
        StringBuilder selector = new StringBuilder(element.tagName().replace(':', '|'));
        String classes = StringUtil.join(element.classNames(), ".");
        if (!classes.isEmpty()) {
            selector.append('.').append(classes);
        }
        Element parent = element.parent();
        if (parent != null && !(parent instanceof Document)) {
            selector.insert(0, " > ");
            // Only disambiguate with a position when the parent has several matching children.
            if (parent.select(selector.toString()).size() > 1) {
                selector.append(":nth-child(").append(element.elementSiblingIndex() + 1).append(')');
            }
            selector.insert(0, localName(parent));
        }
        // Turn the CSS selector syntax into the more compact "a/b:2" form.
        return selector.toString().replace(" > ", "/").replaceAll(":nth-child\\(([0-9]+)\\)", ":$1");
    }

    /**
     * Parses the HTML resource described by {@code properties} and feeds the selected
     * elements to {@code facadeXGraphBuilder}.
     *
     * <p>When more than one element matches the selector, a root is added and each match
     * becomes a numbered child container of it; when exactly one element matches, that
     * element itself is the root. When nothing matches, nothing is added.
     *
     * @param properties          triplifier options (location, charset, selector, browser settings, ...)
     * @param facadeXGraphBuilder the builder receiving the extracted structure
     * @throws IOException             if the resource cannot be read, or an element yields an invalid URI
     * @throws TriplifierHTTPException if resolving the input over HTTP fails
     * @throws NullPointerException    if metadata extraction or a browser is requested but no location is available
     */
    @Override // io.github.sparqlanything.model.Triplifier
    public void triplify(Properties properties, FacadeXGraphBuilder facadeXGraphBuilder) throws IOException, TriplifierHTTPException {
        Charset charset = Triplifier.getCharsetArgument(properties);
        boolean blankNodes = PropertyUtils.getBooleanProperty(properties, IRIArgument.BLANK_NODES);
        String namespace = PropertyUtils.getStringProperty(properties, IRIArgument.NAMESPACE);
        String selector = properties.getProperty(PROPERTY_SELECTOR, ":root");
        log.trace("{}", properties);

        if (properties.containsKey(PROPERTY_METADATA) && Boolean.parseBoolean(properties.getProperty(PROPERTY_METADATA))) {
            log.trace("Extracting metadata (needs HTTP location)");
            try {
                URL metadataLocation = Objects.requireNonNull(Triplifier.getLocation(properties),
                        "metadata extraction requires a location");
                extractMetadata(metadataLocation, facadeXGraphBuilder);
            } catch (IOException | URISyntaxException | ExtractionException | TripleHandlerException e) {
                // Metadata is best-effort: report the failure (with its cause) and carry on.
                log.error(e.getMessage(), e);
            }
        }

        log.trace("namespace {}\n root {}\ncharset {}\nselector {}", namespace, ROOT_ID, charset, selector);

        Document document = loadDocument(properties, charset);
        Elements selected = document.select(selector);
        boolean multipleMatches = selected.size() > 1;
        if (multipleMatches) {
            facadeXGraphBuilder.addRoot(DATA_SOURCE_ID);
        }

        int slot = 0;
        for (Element element : selected) {
            slot++;
            String resourceId;
            if (multipleMatches) {
                resourceId = toResourceId(element, blankNodes);
                facadeXGraphBuilder.addContainer(DATA_SOURCE_ID, ROOT_ID, slot, resourceId);
            } else {
                // A single match is itself the root.
                resourceId = ROOT_ID;
                facadeXGraphBuilder.addRoot(DATA_SOURCE_ID);
            }
            try {
                populate(facadeXGraphBuilder, DATA_SOURCE_ID, element, blankNodes, resourceId);
            } catch (URISyntaxException e) {
                throw new IOException(e);
            }
        }
    }

    /**
     * Obtains the jsoup document, either through a headless browser (when
     * {@code html.browser} is set) or directly from the triplifier input stream.
     */
    private Document loadDocument(Properties properties, Charset charset) throws IOException, TriplifierHTTPException {
        URL location = Triplifier.getLocation(properties);
        if (properties.containsKey(PROPERTY_BROWSER)) {
            log.debug("Browser used (needs an HTTP location): {}", location);
            log.debug("Loading URL: {}", location);
            String url = Objects.requireNonNull(location, "browser navigation requires a location").toString();
            return Jsoup.parse(useBrowserToNavigate(url, properties));
        }
        try (InputStream in = Triplifier.getInputStream(properties)) {
            return Jsoup.parse(in, charset.name(), Triplifier.getResourceId(properties));
        }
    }

    /** @return the MIME types handled by this triplifier ({@code text/html}) */
    @Override // io.github.sparqlanything.model.Triplifier
    public Set<String> getMimeTypes() {
        return Sets.newHashSet("text/html");
    }

    /** @return the file extensions handled by this triplifier ({@code html}) */
    @Override // io.github.sparqlanything.model.Triplifier
    public Set<String> getExtensions() {
        return Sets.newHashSet("html");
    }

    /**
     * Runs Any23 on the given URL and forwards what it extracts to a {@link MetadataWriter}
     * wrapping the builder. The writer is closed whether or not extraction succeeds.
     */
    private void extractMetadata(URL url, FacadeXGraphBuilder facadeXGraphBuilder) throws IOException, URISyntaxException, ExtractionException, TripleHandlerException {
        Any23 any23 = new Any23();
        any23.setHTTPUserAgent("test-user-agent");
        DocumentSource source = any23.createDocumentSource(url.toString());
        try (MetadataWriter writer = new MetadataWriter(facadeXGraphBuilder)) {
            any23.extract(source, writer);
        }
    }

    /**
     * Reports one element and, recursively, its descendants to the builder.
     *
     * @param facadeXGraphBuilder the builder receiving the structure
     * @param dataSourceId        first argument passed through to every builder call
     * @param element             the element to describe
     * @param blankNodes          whether child identifiers are hash based rather than path based
     * @param resourceId          identifier under which {@code element} is reported
     * @throws URISyntaxException if a tag or attribute name does not form a valid URI
     */
    private void populate(FacadeXGraphBuilder facadeXGraphBuilder, String dataSourceId, Element element, boolean blankNodes, String resourceId) throws URISyntaxException {
        String innerHtml = element.html();
        if (!innerHtml.trim().isEmpty()) {
            facadeXGraphBuilder.addValue(dataSourceId, resourceId, new URI(DOM_NS + "innerHTML"), innerHtml);
        }
        String innerText = element.select("*").text();
        if (!innerText.trim().isEmpty()) {
            facadeXGraphBuilder.addValue(dataSourceId, resourceId, new URI(DOM_NS + "innerText"), innerText);
        }
        facadeXGraphBuilder.addType(dataSourceId, resourceId, new URI(HTML_NS + element.tagName()));
        for (Attribute attribute : element.attributes()) {
            facadeXGraphBuilder.addValue(dataSourceId, resourceId, new URI(HTML_NS + attribute.getKey()), attribute.getValue());
        }

        int slot = 0;
        for (Node child : element.childNodes()) {
            String childHtml = child.outerHtml();
            if (childHtml.trim().isEmpty()) {
                // Whitespace-only nodes do not consume a slot.
                continue;
            }
            slot++;
            if (child instanceof Element) {
                Element childElement = (Element) child;
                String childId = toResourceId(childElement, blankNodes);
                facadeXGraphBuilder.addContainer(dataSourceId, resourceId, slot, childId);
                populate(facadeXGraphBuilder, dataSourceId, childElement, blankNodes, childId);
            } else {
                facadeXGraphBuilder.addValue(dataSourceId, resourceId, slot, childHtml);
            }
        }
    }

    /**
     * Computes the identifier under which an element is reported: the hexadecimal hash
     * code of the element when {@code blankNodes} is set, otherwise {@code /} followed by
     * its {@link #localName(Element) path-like name}.
     */
    private String toResourceId(Element element, boolean blankNodes) {
        if (blankNodes) {
            return Integer.toHexString(element.hashCode());
        }
        String name = localName(element);
        log.debug("{}", name);
        return "/".concat(name);
    }

    /**
     * Loads {@code url} in a headless Playwright browser and returns the rendered markup of
     * the page followed by the markup of all of its nested frames.
     *
     * <p>Honours {@code html.browser} (chromium, firefox or webkit; anything else falls back
     * to chromium), {@code html.browser.timeout} (navigation timeout in milliseconds),
     * {@code html.browser.wait} (seconds to sleep after navigation),
     * {@code html.browser.screenshot} (URI of a file to save a screenshot to) and every
     * {@code http.header.*} property (sent as extra HTTP headers).
     *
     * @throws NumberFormatException if {@code html.browser.timeout} is not an integer
     */
    private String useBrowserToNavigate(String url, Properties properties) {
        Map<String, String> headers = extractHttpHeaders(properties);
        log.debug("HTTP headers passed to headless browser: {}", headers);

        // Both the Playwright driver and the browser must be released even when navigation fails.
        try (Playwright playwright = Playwright.create();
             Browser browser = launchBrowser(playwright, properties.getProperty(PROPERTY_BROWSER))) {
            Page page = browser.newContext().newPage();
            page.setExtraHTTPHeaders(headers);

            if (properties.containsKey(PROPERTY_BROWSER_TIMEOUT)) {
                int timeoutMillis = Integer.parseInt(properties.getProperty(PROPERTY_BROWSER_TIMEOUT));
                log.debug("headless browser navigating to url with timeout of {} milliseconds", timeoutMillis);
                Page.NavigateOptions options = new Page.NavigateOptions();
                options.setTimeout(timeoutMillis);
                page.navigate(url, options);
            } else {
                page.navigate(url);
            }

            waitAndScreenshot(page, properties);

            String html = page.content() + getFrames(page.mainFrame());
            log.debug("HTML content: {}", html);
            return html;
        }
    }

    /** Collects the {@code http.header.<name>} string properties into a {@code <name> -> value} map. */
    private static Map<String, String> extractHttpHeaders(Properties properties) {
        Map<String, String> headers = new HashMap<>();
        for (Map.Entry<Object, Object> entry : properties.entrySet()) {
            if (!(entry.getKey() instanceof String) || !(entry.getValue() instanceof String)) {
                continue;
            }
            String key = (String) entry.getKey();
            // Require a non-empty header name after the prefix.
            if (key.startsWith(HTTP_HEADER_PREFIX) && key.length() > HTTP_HEADER_PREFIX.length()) {
                headers.put(key.substring(HTTP_HEADER_PREFIX.length()), (String) entry.getValue());
            }
        }
        return headers;
    }

    /** Launches the browser named by {@code browserName}, falling back to chromium for unknown names. */
    private static Browser launchBrowser(Playwright playwright, String browserName) {
        if (browserName != null) {
            switch (browserName) {
                case "chromium":
                    log.debug("using chromium");
                    return playwright.chromium().launch();
                case "firefox":
                    log.debug("using firefox");
                    return playwright.firefox().launch();
                case "webkit":
                    log.debug("using webkit");
                    return playwright.webkit().launch();
                default:
                    break;
            }
        }
        log.warn("\"{}\" is not a valid browser -- defaulting to chromium", browserName);
        return playwright.chromium().launch();
    }

    /**
     * Applies the optional post-navigation wait and screenshot. Both are best-effort:
     * a failure is logged and the page content is still returned by the caller.
     */
    private void waitAndScreenshot(Page page, Properties properties) {
        try {
            if (properties.containsKey(PROPERTY_BROWSER_WAIT)) {
                int waitSeconds = Integer.parseInt(properties.getProperty(PROPERTY_BROWSER_WAIT));
                log.debug("headless browser navigated to url and now will wait for {} seconds...", waitSeconds);
                TimeUnit.SECONDS.sleep(waitSeconds);
            }
            if (properties.containsKey(PROPERTY_BROWSER_SCREENSHOT)) {
                URI target = new URI(properties.getProperty(PROPERTY_BROWSER_SCREENSHOT));
                page.screenshot(new Page.ScreenshotOptions().setPath(Paths.get(target)));
            }
        } catch (InterruptedException e) {
            // Restore the flag so callers further up can observe the interruption.
            Thread.currentThread().interrupt();
            log.warn("Interrupted while waiting for the headless browser", e);
        } catch (URISyntaxException | RuntimeException e) {
            log.warn("Could not apply the headless browser wait/screenshot options: {}", e.getMessage(), e);
        }
    }

    /**
     * Concatenates, depth first, the content of every frame nested below {@code frame}.
     * The content of {@code frame} itself is not included.
     */
    private String getFrames(Frame frame) {
        StringBuilder content = new StringBuilder();
        for (Frame child : frame.childFrames()) {
            content.append(child.content()).append(getFrames(child));
        }
        return content.toString();
    }
}
