jsoup
正常情况下,jsoup 只能获取静态页面,无法抓取通过 ajax 请求动态加载之后的页面内容。那么,如果一定要抓取动态数据展示之后的网页内容,
该怎么处理呢?这时可以采用最直接的方式:先将网页内容下载到本地,再用 jsoup 解析本地的网页文件即可。
下载整个网页到本地
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.net.HttpURLConnection;
import java.net.URL;
import java.nio.charset.StandardCharsets;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Downloads one web page to a local file and then prints the {@code href} of every
 * {@code .preview} element found under the element with id {@code thumbs} in the saved copy.
 *
 * <p>NOTE: the download is a single plain HTTP GET. This code does not execute any JavaScript,
 * so the saved file contains only what the server returns for that request.
 */
public class TestJsoup {

    /** Page that is downloaded and then parsed. */
    private static final String PAGE_URL = "https://wallhaven.cc/search?q=id%3A37435&page=2";

    /** Location of the local copy of the page. */
    private static final String LOCAL_FILE = "D:\\test.html";

    /** Browser-like User-Agent header sent with the request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/79.0.3945.117 Safari/537.36";

    /**
     * Entry point: saves {@link #PAGE_URL} to {@link #LOCAL_FILE}, then prints the preview links
     * found in the saved file to standard output, one per line.
     *
     * @param args command-line arguments (unused)
     * @throws Exception if the download, the file write, or the parse fails, or if the saved
     *     page has no element with id {@code thumbs}
     */
    public static void main(String[] args) throws Exception {
        File localCopy = new File(LOCAL_FILE);
        download(PAGE_URL, localCopy);
        printPreviewLinks(localCopy);
    }

    /**
     * Fetches {@code pageUrl} with an HTTP GET and writes the response body to {@code target},
     * line by line, encoded as UTF-8.
     *
     * @param pageUrl address of the page to fetch
     * @param target file that receives the page content (overwritten if it exists)
     * @throws IOException if the request fails or the file cannot be written
     */
    private static void download(String pageUrl, File target) throws IOException {
        HttpURLConnection conn = (HttpURLConnection) new URL(pageUrl).openConnection();
        try {
            conn.setRequestMethod("GET");
            conn.setRequestProperty("User-Agent", USER_AGENT);
            // The reader is opened first so that a failed request does not truncate an existing
            // local file. The writer uses an explicit UTF-8 encoder because the file is later
            // parsed as UTF-8; FileWriter would use the platform default charset on older JDKs.
            // BufferedWriter is used directly (not PrintWriter) so write errors are not swallowed.
            try (BufferedReader reader = new BufferedReader(
                            new InputStreamReader(conn.getInputStream(), StandardCharsets.UTF_8));
                    BufferedWriter writer = new BufferedWriter(
                            new OutputStreamWriter(new FileOutputStream(target), StandardCharsets.UTF_8))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    writer.write(line);
                    writer.newLine();
                }
            }
        } finally {
            conn.disconnect();
        }
    }

    /**
     * Parses {@code localCopy} as UTF-8 HTML and prints the {@code href} attribute of every
     * element matching {@code .preview} inside the element with id {@code thumbs}.
     *
     * @param localCopy previously saved HTML file
     * @throws IOException if the file cannot be read
     * @throws IllegalStateException if the document has no element with id {@code thumbs}
     */
    private static void printPreviewLinks(File localCopy) throws IOException {
        Document doc = Jsoup.parse(localCopy, "UTF-8", "");
        Element thumbs = doc.getElementById("thumbs");
        if (thumbs == null) {
            // getElementById returns null when nothing matches; fail with a clear message
            // instead of a bare NullPointerException.
            throw new IllegalStateException(
                    "No element with id \"thumbs\" found in " + localCopy.getPath());
        }
        for (Element preview : thumbs.select(".preview")) {
            System.out.println(preview.attr("href"));
        }
    }
}