返回信息流先上抓取到的数据:
附件(146.3KB) byrToptenContent.txt
控制台输出是这样的,跟上面这个附件的txt差不多。内容就是十大帖子,包括帖子名和所有评论信息。
源码在这里~刚学java,匆忙写完,欢迎大家指导~~
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URISyntaxException;
import java.util.Vector;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
/**
 * Small crawler for the BYR forum mobile site: reads the front page, follows
 * the first ten links (treated as the "top ten" threads), downloads every page
 * of each thread and dumps the title plus all posts to a text file.
 */
public class BYR {
    /** Root URL of the mobile site; also used as the base for resolving relative links. */
    private static final String TOPTEN_URL = "http://m.byr.cn/";

    /** How many leading anchors of the front page are treated as the top-ten list. */
    private static final int TOPTEN_COUNT = 10;

    /** Output file used when no path is given on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "d:/byrToptenContent.txt";

    /** Author name used when a post has no matching user link on the page. */
    private static final String UNKNOWN_USER = "未知用户";

    /**
     * Matches the pager fragment between the "next page" and "last page" links
     * and captures the page number carried by the "last page" link (p=N).
     */
    private static final Pattern LAST_PAGE_PATTERN = Pattern.compile("下页.+?p=(\\d+)[\"']>尾页");

    /**
     * Downloads the given URL and returns the response body as a string.
     *
     * @param url address to fetch
     * @return the response body decoded by {@code EntityUtils.toString}
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws IOException             on a connection problem
     */
    private String getHTML(String url) throws ClientProtocolException, IOException {
        CloseableHttpClient httpClient = HttpClients.createDefault();
        try {
            CloseableHttpResponse response = httpClient.execute(new HttpGet(url));
            try {
                return EntityUtils.toString(response.getEntity());
            } finally {
                // Release the connection even if reading the entity fails.
                response.close();
            }
        } finally {
            httpClient.close();
        }
    }

    /**
     * Collects the absolute URLs of the first {@link #TOPTEN_COUNT} anchors of
     * the front page and echoes each one to standard output.
     *
     * @param html front page markup
     * @return absolute thread URLs, in page order
     */
    private Vector<String> parseToptenHTML(String html) {
        Vector<String> vectorLinks = new Vector<String>();
        // The second argument of Jsoup.parse is the base URI (not a charset);
        // passing the site root lets absUrl() resolve relative hrefs.
        org.jsoup.nodes.Document document = Jsoup.parse(html, TOPTEN_URL);
        Elements links = document.select("a[href]");
        int examined = 0;
        for (org.jsoup.nodes.Element link : links) {
            if (examined >= TOPTEN_COUNT) {
                break;
            }
            examined++;
            String absolute = link.absUrl("href");
            if (absolute.length() == 0) {
                // absUrl() yields "" when the href cannot be turned into a URL.
                continue;
            }
            vectorLinks.add(absolute);
            System.out.println(absolute);
        }
        return vectorLinks;
    }

    /**
     * Downloads every page of a thread and returns its content line by line.
     * The first element is the text of the {@code li.f} element of page one;
     * each following element is one post formatted as
     * {@code "<floor label><author> <text>"}. Every post line is also printed
     * to standard output.
     *
     * @param url thread URL without the {@code ?p=} page parameter
     * @return title followed by all posts, in thread order
     * @throws IOException if any page cannot be downloaded
     */
    public Vector<String> parseArticleHTML(String url) throws IOException {
        Vector<String> vectorComments = new Vector<String>();
        Vector<String> vectorUsers = new Vector<String>();

        // A thread without a pager has exactly one page, so default to 1
        // instead of 0 (which would skip the thread entirely).
        int pageNum = 1;
        Matcher m = LAST_PAGE_PATTERN.matcher(getHTML(url));
        if (m.find()) {
            pageNum = Integer.parseInt(m.group(1));
            System.out.println("此贴共有 "+pageNum+ "页");
        }

        // floor counts posts across all pages and doubles as the index into
        // vectorUsers, which accumulates user links across pages in the same order.
        int floor = 0;
        for (int page = 1; page <= pageNum; page++) {
            org.jsoup.nodes.Document document = Jsoup.parse(getHTML(url + "?p=" + page));
            if (page == 1) {
                vectorComments.add(document.select("li[class=f]").text());
            }
            for (org.jsoup.nodes.Element userLink : document.select("a[href~=.+user.+]")) {
                vectorUsers.add(userLink.text());
            }
            for (org.jsoup.nodes.Element post : document.select("div[class=sp]")) {
                // Guard against pages where user links and posts do not line up.
                String author = floor < vectorUsers.size() ? vectorUsers.get(floor) : UNKNOWN_USER;
                String label = (floor == 0) ? "楼主: " : floor + "楼: ";
                String line = label + author + " " + post.text();
                vectorComments.add(line);
                System.out.println(line);
                floor++;
            }
        }
        return vectorComments;
    }

    /**
     * Crawls the top-ten threads and writes them to a text file.
     *
     * @param args optional; {@code args[0]} overrides the output file path
     *             (defaults to {@code d:/byrToptenContent.txt})
     * @throws ClientProtocolException on an HTTP protocol error
     * @throws URISyntaxException      kept in the signature for compatibility
     * @throws IOException             on a network or file error
     */
    public static void main(String[] args) throws ClientProtocolException, URISyntaxException, IOException {
        BYR byr = new BYR();
        Vector<String> vectorLinks = byr.parseToptenHTML(byr.getHTML(TOPTEN_URL));

        String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;
        BufferedWriter bw = new BufferedWriter(new FileWriter(new File(outputPath)));
        try {
            for (int i = 0; i < vectorLinks.size(); i++) {
                int a = i + 1;
                bw.write("********以下是今天十大第 "+String.valueOf(a)+" 贴的内容********"+"\r\n");
                Vector<String> vectorComments = byr.parseArticleHTML(vectorLinks.get(i));
                for (int j = 0; j < vectorComments.size(); j++) {
                    bw.write(vectorComments.get(j)+"\r\n");
                }
                bw.write("\r\n"+"\r\n");
            }
        } finally {
            // Flush and close the file even when a download fails midway.
            bw.close();
        }
    }
}
这是一条镜像帖。来源:北邮人论坛 / java / #28193,同步于 2013/12/24
该镜像源已超过 30 天没有更新,可能在源站已被删除。
Java机器人发帖
【论坛爬虫】看到有个同学想抓取十大信息,我自己动手写了一个~
hainanlxs
2013/12/24 镜像同步 · 28 回复
订阅后,新回复会通过你的通知中心匿名送达。
9 条回复
3年前写过,用xpath,不过现在应该有更优雅的方法了吧。
以前写的,当反面教材吧:https://github.com/wks/libbyr4j/blob/master/src/com/github/wks/libbyr4j/Byr.java
师兄的代码很漂亮啊,而且肯定经验丰富了,方法组织得很高端的样子,代码长实现的功能也多吧。向师兄学习~
【 在 wks 的大作中提到: 】
: 3年前写过,用xpath,不过现在应该有更优雅的方法了吧。
: 以前写的,当反面教材吧:https://github.com/wks/libbyr4j/blob/master/src/com/github/wks/libbyr4j/Byr.java
暖姐姐居然是师兄,累觉不爱~
【 在 wks (cloverprince) 的大作中提到: 】
: 3年前写过,用xpath,不过现在应该有更优雅的方法了吧。
: 以前写的,当反面教材吧:https://github.com/wks/libbyr4j/blob/master/src/com/github/wks/libbyr4j/Byr.java