html解析 jsoup使用介绍 jsoup解析html

2016-03-31 22:54:00
admin
原创 2139
摘要:html解析 jsoup使用介绍 jsoup解析html

一、jsoup做什么

jsoup is a Java library for working with real-world HTML. It provides a very convenient API for extracting and manipulating data, using the best of DOM, CSS, and jquery-like methods.


二、jsoup的Maven配置

   <!-- Declares the jsoup HTML parsing library as a Maven dependency, pinned to version 1.8.3. -->
   <dependency>
  <groupId>org.jsoup</groupId>
  <artifactId>jsoup</artifactId>
  <version>1.8.3</version>
  </dependency>


三、index.html的内容

<!DOCTYPE html>
<html>
<head>
<meta charset="UTF-8">
<title>jsoup use</title>
<!-- Sample page: HtmlParser.parseSimpleHtml() loads this file from the classpath as /index.html. -->
</head>
<body>
first&nbsp;demo:
show
<div>
<p>how to use jsoup?</p>
</div>
</body>
</html>


四、解析html代码示例

代码下载(浏览器打开可能乱码,请直接下载然后打开):HtmlParser.java

工程文件:html.rar


import java.io.*;
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;
import org.jsoup.parser.*;
import org.jsoup.helper.*;

public class HtmlParser {

public static class TextVisitor implements NodeVisitor {

//单线程一般用StringBuilder,多线程一般用StringBuffer
private StringBuilder text = new StringBuilder();
private boolean showBlock;

public TextVisitor(boolean showBlock) {
this.showBlock = showBlock;
}

public String getText() {
return text.toString().trim();
}

public static boolean preserveWhitespace(Node node) {
       if (node != null && node instanceof Element) {
           Element element = (Element)node;
           return element.tag().preserveWhitespace() ||
               element.parent() != null && element.parent().tag().preserveWhitespace();
       }
       return false;
   }

public static boolean lastCharIsSpace(StringBuilder sb) {
       return sb.length() != 0 && sb.charAt(sb.length() - 1) == ' ';
   }

public static boolean lastCharIsLineBreak(StringBuilder sb) {
       return sb.length() != 0 && sb.charAt(sb.length() - 1) == '\n';
   }

private static void appendNormalisedText(StringBuilder accum, TextNode textNode) {
       String text = textNode.getWholeText();
       text = text.trim();
       if (preserveWhitespace(textNode.parent()))
           accum.append(text);
       else
           StringUtil.appendNormalisedWhitespace(accum, text, lastCharIsSpace(accum));
   }

public void head(Node node, int depth) {
if (node instanceof TextNode) {
                TextNode textNode = (TextNode)node;
                appendNormalisedText(text, textNode);
            } else if (node instanceof Element) {
                Element element = (Element)node;
                Tag tag = element.tag();
                String tagName = tag.getName();
                boolean isBlock = tag.isBlock();
                if (showBlock)
                System.out.println(tagName + " block state is " + isBlock);
                
                if (text.length() > 0
                && (isBlock || tagName.equals("br"))
                && !lastCharIsLineBreak(text))
                    text.append("\n");
            }
}

public void tail(Node node, int depth) {
}
}

public static void parseSimpleHtml() throws Exception {
InputStream input =
HtmlParser.class.getResourceAsStream("/index.html");
Document doc = Jsoup.parse(input, "utf-8", "http://www.3scard.com");

TextVisitor visitor = new TextVisitor(true);
NodeTraversor traversor = new NodeTraversor(visitor);
traversor.traverse(doc);
System.out.println(visitor.getText());
}

public static void parseComplexHtml() throws Exception {
String url = "http://www.3scard.com/index.php?m=blog&f=index";
Document doc = Jsoup.connect(url).get();
System.out.println(doc.text());

TextVisitor visitor = new TextVisitor(false);
NodeTraversor traversor = new NodeTraversor(visitor);
traversor.traverse(doc);
System.out.println(visitor.getText());
}

public static void main(String[] args) throws Exception {
parseSimpleHtml();
parseComplexHtml();
}
}

发表评论
评论通过审核之后才会显示。