程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> JAVA編程 >> 關於JAVA >> JAVA爬蟲代碼

JAVA爬蟲代碼

編輯:關於JAVA

JAVA爬蟲代碼。本站提示廣大學習愛好者:(JAVA爬蟲代碼)文章僅供參考,不一定能完全解決您的問題。以下是JAVA爬蟲代碼正文。


工程目錄:

所需要的jar包為:   jsoup-1.10.2.jar

 

/**
 * Created by wangzheng on 2017/2/19.
 */
public class Article {

    /**
     * 文章鏈接的相對地址
     */
    private String address;

    /**
     * 文章標題
     */
    private String title;

    /**
     * 文章簡介
     */
    private String desption;

    /**
     * 文章發表時間
     */
    private String time;

    public String getAddress() {
        return address;
    }

    public void setAddress(String address) {
        this.address = address;
    }

    public String getTitle() {
        return title;
    }

    public void setTitle(String title) {
        this.title = title;
    }

    public String getDesption() {
        return desption;
    }

    public void setDesption(String desption) {
        this.desption = desption;
    }

    public String getTime() {
        return time;
    }

    public void setTime(String time) {
        this.time = time;
    }
}
/**
 * Created by wangzheng on 2017/2/19.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Minimal crawler example: downloads one article-list page of a blog with
 * jsoup, extracts every listed article into an {@link Article} and prints
 * the number of articles found followed by each article's absolute URL.
 */
public class First {
    // Article-list page to crawl.
    // Alternative blog that can be used instead: "http://blog.csdn.net/guolin_blog"
    private static final String URL = "http://blog.csdn.net/qq_33599520/article/list/1";

    /**
     * Fetches {@link #URL}, parses the article list and prints the result to
     * standard output.
     *
     * @param args unused
     * @throws IOException if the page cannot be fetched, or if it does not
     *                     contain the expected {@code article_list} element
     */
    public static void main(String[] args) throws IOException {

        // Connection.get() always issues a GET request, so no explicit
        // .method(Connection.Method.GET) call is needed.
        Document doc = Jsoup.connect(URL)
                .userAgent("Mozilla/5.0 (Windows NT 6.1; rv:47.0) Gecko/20100101 Firefox/47.0") // browser identification sent with the request
                .timeout(5000)   // timeout in milliseconds
                .get();

        List<Article> resultList = parseArticles(doc.body());

        // Print everything that was collected.
        System.out.println("文章總數:" + resultList.size());
        for (Article article : resultList) {
            System.out.println("文章絕對路勁地址:http://blog.csdn.net" + article.getAddress());
        }
    }

    /**
     * Converts every {@code list_item} element inside the {@code article_list}
     * container into an {@link Article}.
     */
    private static List<Article> parseArticles(Element body) throws IOException {
        Element articleListDiv = body.getElementById("article_list");
        if (articleListDiv == null) {
            // getElementById returns null when the id is absent; fail with a
            // readable message instead of a bare NullPointerException.
            throw new IOException("No element with id \"article_list\" found at " + URL);
        }

        List<Article> resultList = new ArrayList<Article>();
        for (Element item : articleListDiv.getElementsByClass("list_item")) {
            // Elements.first() yields null for an empty match, unlike get(0)
            // which throws IndexOutOfBoundsException.
            Element linkNode = item.select("div h1 a").first();
            if (linkNode == null) {
                // Without a link there is neither an address nor a title to record.
                continue;
            }
            Element descriptionNode = item.getElementsByClass("article_description").first();
            Element manageNode = item.getElementsByClass("article_manage").first();

            Article articleEntity = new Article();
            articleEntity.setAddress(linkNode.attr("href"));
            articleEntity.setTitle(linkNode.text());
            articleEntity.setDesption(descriptionNode == null ? "" : descriptionNode.text());
            // The selector previously read "span:eq(0" (missing closing parenthesis).
            articleEntity.setTime(manageNode == null ? "" : manageNode.select("span:eq(0)").text());

            resultList.add(articleEntity);
        }
        return resultList;
    }
}
/**
 * Created by wangzheng on 2017/2/19.
 */
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import org.jsoup.*;
import org.jsoup.nodes.*;
import org.jsoup.select.*;

/**
 * Crawls every article-list page of one blog with jsoup and prints the
 * title, absolute URL, summary and publication time of each article.
 */
public class Main {

    private static final String URL = "http://blog.csdn.net/qq_33599520";

    /** Browser identification sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:50.0) Gecko/20100101 Firefox/50.0";

    /** Request timeout in milliseconds. */
    private static final int TIMEOUT_MS = 5000;

    /**
     * Extracts N from pager text of the form "... 共N頁".
     * NOTE(review): the Simplified character 页 is accepted as well, on the
     * assumption that the live site may serve Simplified Chinese — confirm.
     */
    private static final java.util.regex.Pattern TOTAL_PAGES =
            java.util.regex.Pattern.compile(".*共(\\d+)[頁页]");

    /**
     * Reads the total page count from the blog home page, collects the
     * articles of every page and prints them to standard output.
     *
     * @param args unused
     * @throws IOException if a page cannot be fetched or lacks the expected structure
     */
    public static void main(String[] args) throws IOException {
        int totalPage = parseTotalPages(fetchBody(URL));

        List<Article> articleList = new ArrayList<Article>();
        for (int pageNow = 1; pageNow <= totalPage; pageNow++) {
            articleList.addAll(getArtitcleByPage(pageNow));
        }

        // Print every article of the blog.
        for (Article article : articleList) {
            System.out.println("文章標題:" + article.getTitle());
            System.out.println("文章絕對路勁地址:http://blog.csdn.net" + article.getAddress());
            System.out.println("文章簡介:" + article.getDesption());
            System.out.println("發表時間:" + article.getTime());
        }
    }

    /**
     * Downloads one article-list page and converts every {@code list_item}
     * element inside the {@code article_list} container into an {@link Article}.
     * Items that contain no title link are skipped; a missing summary or
     * time element yields an empty string for that property.
     *
     * @param pageNow 1-based number of the list page to fetch
     * @return the articles found on that page, in document order (possibly empty)
     * @throws IOException if the page cannot be fetched, or if it contains no
     *                     element with id {@code article_list}
     */
    public static List<Article> getArtitcleByPage(int pageNow) throws IOException{
        String pageUrl = URL + "/article/list/" + pageNow;
        Element body = fetchBody(pageUrl);

        Element articleListDiv = body.getElementById("article_list");
        if (articleListDiv == null) {
            // getElementById returns null when the id is absent; fail with a
            // readable message instead of a bare NullPointerException.
            throw new IOException("No element with id \"article_list\" found at " + pageUrl);
        }

        List<Article> resultList = new ArrayList<Article>();
        for (Element item : articleListDiv.getElementsByClass("list_item")) {
            // Elements.first() yields null for an empty match, unlike get(0)
            // which throws IndexOutOfBoundsException.
            Element linkNode = item.select("div h1 a").first();
            if (linkNode == null) {
                // Without a link there is neither an address nor a title to record.
                continue;
            }
            Element descriptionNode = item.getElementsByClass("article_description").first();
            Element manageNode = item.getElementsByClass("article_manage").first();

            Article articleEntity = new Article();
            articleEntity.setAddress(linkNode.attr("href"));
            articleEntity.setTitle(linkNode.text());
            articleEntity.setDesption(descriptionNode == null ? "" : descriptionNode.text());
            // The selector previously read "span:eq(0" (missing closing parenthesis).
            articleEntity.setTime(manageNode == null ? "" : manageNode.select("span:eq(0)").text());

            resultList.add(articleEntity);
        }
        return resultList;
    }

    /** Fetches {@code url} with a GET request and returns the body element of the parsed document. */
    private static Element fetchBody(String url) throws IOException {
        // Connection.get() always issues a GET request, so no explicit
        // .method(Connection.Method.GET) call is needed.
        Document doc = Jsoup.connect(url)
                .userAgent(USER_AGENT)
                .timeout(TIMEOUT_MS)
                .get();
        return doc.body();
    }

    /** Reads the total number of list pages from the pager element with id {@code papelist}. */
    private static int parseTotalPages(Element body) throws IOException {
        Element pager = body.getElementById("papelist");
        if (pager == null) {
            // Presumably the pager is omitted when the blog has a single
            // page of articles; crawl just page 1 in that case.
            return 1;
        }
        String pagerText = pager.select("span:eq(0)").text();
        java.util.regex.Matcher matcher = TOTAL_PAGES.matcher(pagerText);
        if (!matcher.find()) {
            throw new IOException(
                    "Cannot read the total page count from pager text \"" + pagerText + "\"");
        }
        return Integer.parseInt(matcher.group(1));
    }
}

 

  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved