package html; import java.io.*; import org.jsoup.*; import org.jsoup.nodes.*; import org.jsoup.select.*; import org.jsoup.parser.*; import org.jsoup.helper.*; public class HtmlParser { public static class TextVisitor implements NodeVisitor { //单线程一般用StringBuilder,多线程一般用StringBuffer private StringBuilder text = new StringBuilder(); private boolean showBlock; public TextVisitor(boolean showBlock) { this.showBlock = showBlock; } public String getText() { return text.toString().trim(); } public static boolean preserveWhitespace(Node node) { if (node != null && node instanceof Element) { Element element = (Element)node; return element.tag().preserveWhitespace() || element.parent() != null && element.parent().tag().preserveWhitespace(); } return false; } public static boolean lastCharIsSpace(StringBuilder sb) { return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' '; } public static boolean lastCharIsLineBreak(StringBuilder sb) { return sb.length() != 0 && sb.charAt(sb.length() - 1) == '\n'; } private static void appendNormalisedText(StringBuilder accum, TextNode textNode) { String text = textNode.getWholeText(); text = text.trim(); if (preserveWhitespace(textNode.parent())) accum.append(text); else StringUtil.appendNormalisedWhitespace(accum, text, lastCharIsSpace(accum)); } public void head(Node node, int depth) { if (node instanceof TextNode) { TextNode textNode = (TextNode)node; appendNormalisedText(text, textNode); } else if (node instanceof Element) { Element element = (Element)node; Tag tag = element.tag(); String tagName = tag.getName(); boolean isBlock = tag.isBlock(); if (showBlock) System.out.println(tagName + " block state is " + isBlock); if (text.length() > 0 && (isBlock || tagName.equals("br")) && !lastCharIsLineBreak(text)) text.append("\n"); } } public void tail(Node node, int depth) { } } public static void parseSimpleHtml() throws Exception { InputStream input = HtmlParser.class.getResourceAsStream("/index.html"); Document doc = Jsoup.parse(input, "utf-8", "http://www.3scard.com"); TextVisitor visitor = new TextVisitor(true); NodeTraversor traversor = new NodeTraversor(visitor); traversor.traverse(doc); System.out.println(visitor.getText()); } public static void parseComplexHtml() throws Exception { String url = "http://www.3scard.com/index.php?m=blog&f=index"; Document doc = Jsoup.connect(url).get(); System.out.println(doc.text()); TextVisitor visitor = new TextVisitor(false); NodeTraversor traversor = new NodeTraversor(visitor); traversor.traverse(doc); System.out.println(visitor.getText()); } public static void main(String[] args) throws Exception { parseSimpleHtml(); parseComplexHtml(); } }