package de.l3s.boilerpipe.sax;

import de.l3s.boilerpipe.BoilerpipeExtractor;
import de.l3s.boilerpipe.BoilerpipeProcessingException;
import de.l3s.boilerpipe.document.TextDocument;
import java.io.IOException;
import java.net.URI;
import java.net.URISyntaxException;
import java.net.URL;
import java.util.Arrays;
import java.util.HashSet;
import java.util.Map;
import java.util.Set;
import net.htmlparser.jericho.Element;
import net.htmlparser.jericho.HTMLElementName;
import net.htmlparser.jericho.OutputDocument;
import net.htmlparser.jericho.Segment;
import net.htmlparser.jericho.Source;
import org.xml.sax.SAXException;

/* loaded from: classes.dex */
/**
 * Extracts the main content of an HTML document as a simplified HTML fragment.
 *
 * <p>The document is run through a {@link BoilerpipeExtractor}, the extracted part is rendered
 * with an {@link HTMLHighlighter} in "highlight only" mode, and the result is post-processed:
 * structural tags are stripped, attributes are removed from everything except anchors, and
 * relative anchor links are resolved against the document URI.
 */
public class HtmlArticleExtractor {
    public static final HtmlArticleExtractor INSTANCE = new HtmlArticleExtractor();

    /** Tags whose start and end tags are stripped from the output. */
    private static final Set<String> NOT_ALLOWED_HTML_TAGS = new HashSet<String>(Arrays.asList(HTMLElementName.HEAD, "html", HTMLElementName.SCRIPT, HTMLElementName.STYLE, HTMLElementName.FORM, HTMLElementName.BODY, HTMLElementName.DIV, HTMLElementName.SPAN));

    /** Subset of the not-allowed tags whose content is dropped together with the tags. */
    private static final Set<String> CONTENT_STRIPPED_HTML_TAGS = new HashSet<String>(Arrays.asList(HTMLElementName.SCRIPT, HTMLElementName.STYLE, HTMLElementName.FORM));

    private HtmlArticleExtractor() {
    }

    /**
     * Returns the shared extractor instance.
     *
     * @return {@link #INSTANCE}
     */
    public static HtmlArticleExtractor getInstance() {
        return INSTANCE;
    }

    /**
     * Cleans up an HTML fragment: strips not-allowed tags, clears the attributes of every
     * non-anchor element, resolves anchor links against {@code uri}, and finally removes all
     * newline and tab characters from the resulting markup.
     *
     * @param str the HTML fragment to clean
     * @param uri base URI used to resolve relative anchor links; when {@code null}, links are
     *            left as they are
     * @return the cleaned fragment
     */
    private String removeNotAllowedTags(String str, URI uri) {
        Source source = new Source(str);
        OutputDocument outputDocument = new OutputDocument(source);
        for (Element element : source.getAllElements()) {
            String tagName = element.getName();
            // NOTE(review): guards against elements that carry no attribute list at all
            // (presumably non-standard tag types) - confirm against the Jericho documentation.
            if (element.getAttributes() != null) {
                Map<String, String> attributes = outputDocument.replace(element.getAttributes(), true);
                // Exact name match: a substring test would also treat "table", "span",
                // "caption", ... as anchors and let them keep all of their attributes.
                if (HTMLElementName.A.equals(tagName)) {
                    absolutizeHref(element, attributes, uri, outputDocument);
                } else {
                    attributes.clear();
                }
            }
            if (NOT_ALLOWED_HTML_TAGS.contains(tagName)) {
                if (CONTENT_STRIPPED_HTML_TAGS.contains(tagName)) {
                    Segment content = element.getContent();
                    outputDocument.remove(content);
                }
                outputDocument.remove(element.getStartTag());
                if (!element.getStartTag().isSyntacticalEmptyElementTag()) {
                    outputDocument.remove(element.getEndTag());
                }
            }
        }
        // Literal replacement; no regular expression is needed to drop these two characters.
        return outputDocument.toString().replace("\n", "").replace("\t", "");
    }

    /**
     * Rewrites the {@code href} of an anchor to an absolute link. Links that already start with
     * {@code http://} or {@code https://} are kept untouched. If the link cannot be parsed as a
     * URI, the whole anchor element is removed from the output.
     *
     * @param anchor         the anchor element being processed
     * @param attributes     the editable attribute map registered for {@code anchor}
     * @param baseUri        URI to resolve against; {@code null} disables resolution
     * @param outputDocument the output document the anchor belongs to
     */
    private static void absolutizeHref(Element anchor, Map<String, String> attributes, URI baseUri, OutputDocument outputDocument) {
        String href = attributes.get("href");
        if (href == null || baseUri == null) {
            return;
        }
        // Surrounding whitespace is not part of the link and would make URI parsing fail.
        String link = href.trim();
        if (isHttpUrl(link)) {
            return;
        }
        try {
            attributes.put("href", baseUri.resolve(new URI(link)).toString());
        } catch (URISyntaxException e) {
            outputDocument.remove(anchor);
        }
    }

    /**
     * Tells whether {@code link} starts with the {@code http://} or {@code https://} prefix,
     * ignoring case. Checking the prefix (instead of searching for "http" anywhere) keeps
     * relative links such as {@code /docs/http-basics} eligible for resolution.
     */
    private static boolean isHttpUrl(String link) {
        return link.regionMatches(true, 0, "http://", 0, 7) || link.regionMatches(true, 0, "https://", 0, 8);
    }

    /**
     * Fetches the page at {@code url} with {@link HTMLFetcher} and extracts its main content.
     *
     * @param boilerpipeExtractor the extractor that decides which parts of the page are content
     * @param url                 the page to fetch; also used as base for relative links
     * @return the cleaned HTML fragment, or {@code null} if extraction failed
     * @throws URISyntaxException if {@code url} cannot be converted to a {@link URI}
     */
    public String process(BoilerpipeExtractor boilerpipeExtractor, URL url) throws IOException, BoilerpipeProcessingException, SAXException, URISyntaxException {
        return process(HTMLFetcher.fetch(url), url.toURI(), boilerpipeExtractor);
    }

    /**
     * Extracts the main content of an already loaded document.
     *
     * @param hTMLDocument        the document to process
     * @param uri                 base URI for resolving relative anchor links
     * @param boilerpipeExtractor the extractor that decides which parts of the page are content
     * @return the cleaned HTML fragment, or {@code null} if any step threw an exception
     */
    public String process(HTMLDocument hTMLDocument, URI uri, BoilerpipeExtractor boilerpipeExtractor) {
        HTMLHighlighter highlighter = HTMLHighlighter.newExtractingInstance();
        highlighter.setOutputHighlightOnly(true);
        try {
            TextDocument textDocument = new BoilerpipeSAXInput(hTMLDocument.toInputSource()).getTextDocument();
            boilerpipeExtractor.process(textDocument);
            // The highlighter gets its own input source, obtained by a second toInputSource() call.
            return removeNotAllowedTags(highlighter.process(textDocument, hTMLDocument.toInputSource()), uri);
        } catch (Exception e) {
            // Best-effort contract kept for existing callers: any failure is reported as null.
            return null;
        }
    }
}
