程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> C語言 >> C >> 關於C >> 用CI框架寫的抓取文章標題、內容、來源圖片代碼

用CI框架寫的抓取文章標題、內容、來源圖片代碼

編輯:關於C

之前貼出的版本有些亂、看不清楚,所以重新整理了一下。這是我用 CI(CodeIgniter)框架寫的其中一個抓取頁面的代碼,在此做個備份,方便以後調用。本人能力有限,希望大家多給點意見。

/**
 * Crawl the used-car news site and store every article found.
 *
 * For each of the four news categories (stype 1..4) the method reads the
 * pager on the first list page to learn how many list pages exist, walks
 * every list page, follows every article link, extracts title, source and
 * body with regular expressions, downloads the images referenced in the body
 * into ./static/uploads/news/, rewrites the image URLs in the body to the
 * local copies, and hands the record to News_model::add().
 *
 * Output: echoes an "<hr>" after each list page and the elapsed benchmark
 * time at the end.
 *
 * Articles whose title or body cannot be extracted are skipped; images that
 * cannot be downloaded or saved are left pointing at their original URL.
 *
 * @return void
 */
public function snatch()
{
    // The crawl touches every category, page and article, so it can run far
    // longer than the default execution time limit.
    set_time_limit(0);
    $this->benchmark->mark('code_start');

    /* Walk the different categories of used-car news. */
    for ($i = 1; $i <= 4; $i++)
    {
        $url = 'http://news.2sche.cn/list.asp?stype='.$i;
        $result = $this->curl_snatch($url);

        // The pager is rendered as "<strong>current/total</strong>". The
        // captured total must be read as an integer: comparing the loop
        // counter against the raw match array never becomes false, which
        // made the page loop endless. Without a pager, assume one page.
        $total_pages = 1;
        if (preg_match('/<strong>\d\/(.*?)<\/strong>/', $result, $page_news))
        {
            $total_pages = max(1, (int) $page_news[1]);
        }

        /* Walk every list page of this category. */
        for ($j = 1; $j <= $total_pages; $j++)
        {
            if (1 == $j)
            {
                // Page 1 is the category URL fetched above; reuse it
                // instead of requesting the same page twice.
                $result_news = $result;
            }
            else
            {
                $url_news = 'http://news.2sche.cn/list.asp?page='.$j.'&stype='.$i;
                $result_news = $this->curl_snatch($url_news);
            }

            preg_match_all('/<td width="516" height="28" class="z14"><a href="(.*?)" target="_blank">.*?<\/a><\/td>/sim', $result_news, $url_newslist);

            /* Visit every article URL found on the list page. */
            foreach ($url_newslist[1] as $url_newslists)
            {
                $url_newsinfo = 'http://news.2sche.cn/'.$url_newslists;
                $result_newsinfo = $this->curl_snatch($url_newsinfo);

                /* Title */
                preg_match_all('/<h3 class="title"><strong>(.*?)<\/strong><\/h3>/sim', $result_newsinfo, $title);
                /* Source */
                preg_match_all('/<td style="BORDER-BOTTOM: #666666 1PX DASHED" width="155"><span class="right">【來源:(.*?)&nbsp;】<\/span><\/td>/sim', $result_newsinfo, $source);
                /* Body */
                preg_match_all('/<td colspan="2" class="z14" style="padding-top:20px;padding-left:1px;padding-bottom:20px;line-height:25px">(.*?)<\/td>/sim', $result_newsinfo, $content);

                // A failed fetch or a page with a different layout yields
                // no matches; skip it rather than store an empty record.
                if (!isset($title[1][0], $content[1][0]))
                {
                    continue;
                }
                $body = $content[1][0];

                /* Collect every image URL used inside the body. */
                preg_match_all('/<IMG.*?src="(.*?)".*?>/sim', $body, $img);

                $picture = '';
                // array_unique: an image used twice is downloaded and
                // listed once; str_replace below rewrites all its uses.
                foreach (array_unique($img[1]) as $imgs)
                {
                    // Only absolute http:// URLs are downloaded. The scheme
                    // must be the prefix, otherwise a relative or local
                    // path that merely contains "http://" would be passed
                    // to file_get_contents().
                    if (strpos($imgs, 'http://') !== 0)
                    {
                        continue;
                    }

                    /* Derive the local file name from the URL path only,
                       so a query string never ends up in the name. */
                    $img_url_path = parse_url($imgs, PHP_URL_PATH);
                    $img_names = is_string($img_url_path) ? basename($img_url_path) : '';
                    if ($img_names === '' || $img_names === '.' || $img_names === '..')
                    {
                        continue;
                    }

                    // Keep the original URL in the body when the image
                    // cannot be downloaded or written to disk.
                    $img_source = file_get_contents($imgs);
                    if ($img_source === false)
                    {
                        continue;
                    }
                    if (file_put_contents("./static/uploads/news/".$img_names, $img_source) === false)
                    {
                        continue;
                    }

                    // Saved names are stored as a ':'-terminated list.
                    $picture .= $img_names.':';

                    /* Point the body at the local copy. */
                    $img_path = '/static/uploads/news/'.$img_names;
                    $body = str_replace($imgs, $img_path, $body);
                }

                $data = array(
                    'title' => $title[1][0],
                    // The source line is optional on the page.
                    'source' => isset($source[1][0]) ? $source[1][0] : '',
                    'contents' => trim($body),
                    'picture' => $picture,
                    'style' => $i,
                    'create_time' => time(),
                );

                // Best effort: a failed insert does not stop the crawl.
                $this->News_model->add($data);
            }
            echo '<hr>';
        }
    }

    $this->benchmark->mark('code_end');
    echo $this->benchmark->elapsed_time('code_start', 'code_end');
}

/**
 * Fetch a page and return its body converted from GBK to UTF-8.
 *
 * Uses the cURL extension when it is loaded (following redirects) and falls
 * back to file_get_contents() otherwise. Both paths go through the same
 * GBK -> UTF-8 conversion, so callers always receive UTF-8 text.
 *
 * A cURL failure is reported by echoing "cURL Error: ..." as before.
 *
 * @param string $url Address to fetch; surrounding whitespace is trimmed.
 * @return string Trimmed UTF-8 body, or '' when the fetch or the charset
 *                conversion fails.
 */
function curl_snatch($url='http://www.2sche.cn/buy.asp')
{
    $url = trim($url);
    $raw = false;

    if (extension_loaded('curl'))
    {
        $ch = curl_init();
        // Set the options, including the URL.
        curl_setopt($ch, CURLOPT_URL, $url);
        curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
        curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
        curl_setopt($ch, CURLOPT_HEADER, 0);
        // Execute and read the document body.
        $raw = curl_exec($ch);
        if ($raw === FALSE) {
            echo "cURL Error: " . curl_error($ch);
        }
        // Release the cURL handle.
        curl_close($ch);
    }
    else
    {
        $raw = file_get_contents($url);
    }

    // Check for failure before converting: there is nothing to convert when
    // the request failed or returned an empty body.
    if ($raw === false || $raw === '')
    {
        return '';
    }

    // The fallback branch used to return raw GBK bytes; convert here so both
    // branches behave the same.
    $content = iconv("GBK", "UTF-8", $raw);
    if ($content === false)
    {
        // iconv() returns false when the input cannot be converted.
        return '';
    }

    return trim($content);
}


作者:李佳順
  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved