Portál AbcLinuxu, 1. května 2025 14:02
import org.cyberneko.html.parsers.SAXParser;
import org.dom4j.Document;
import org.dom4j.DocumentException;
import org.dom4j.io.SAXReader;
import org.xml.sax.SAXNotRecognizedException;
import org.xml.sax.SAXNotSupportedException;

/**
 * Example of reading an HTML document into a dom4j {@link Document} by
 * plugging the NekoHTML {@link SAXParser} into a dom4j {@link SAXReader}.
 */
public class Main {
    private final SAXReader htmlReader;
    private final SAXParser parser = new SAXParser();
    private Document document;

    /**
     * Configures the HTML parser, wraps it in a {@link SAXReader} and reads the document.
     *
     * @throws SAXNotRecognizedException if the parser does not recognize a feature or property name
     * @throws SAXNotSupportedException if the parser recognizes a name but cannot apply the value
     * @throws DocumentException if reading the document fails
     */
    private Main() throws SAXNotRecognizedException, SAXNotSupportedException, DocumentException {
        prepareParser();
        htmlReader = new SAXReader(parser);
        // The argument is the original author's elision placeholder: pass the HTML input stream here.
        document = htmlReader.read(… stream s HTML…);
    }

    /**
     * Applies the parser settings used by this example: the script/style
     * "strip-comment-delims" scanner features are switched on, element and
     * attribute names are set to "lower", and the default encoding is set to
     * windows-1250.
     *
     * @throws SAXNotRecognizedException if the parser does not recognize a feature or property name
     * @throws SAXNotSupportedException if the parser recognizes a name but cannot apply the value
     */
    private void prepareParser() throws SAXNotRecognizedException, SAXNotSupportedException {
        parser.setFeature("http://cyberneko.org/html/features/scanner/script/strip-comment-delims", true);
        parser.setFeature("http://cyberneko.org/html/features/scanner/style/strip-comment-delims", true);
        parser.setProperty("http://cyberneko.org/html/properties/names/elems", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/names/attrs", "lower");
        parser.setProperty("http://cyberneko.org/html/properties/default-encoding", "windows-1250");
    }
}
Nedávno jsem narazil ještě na nový projekt HtmlCleaner, zatím jsem ho nezkoušel.
Tiskni
Sdílej:
ISSN 1214-1267, (c) 1999-2007 Stickfish s.r.o.