之前寫過一篇用 curl 批處理采集數據的文章,這裡貼上完整版本。代碼很簡單,廢話不多說,直接上代碼,歡迎指教!
代碼只寫到獲取鏈接為止;至於排名,返回數組的鍵(從 0 開始)對應的就是鏈接的排名順序。
<?php
/**
 * Collects search-result links from Yahoo! Japan for a keyword.
 *
 * @author chujiu <527891885@qq.com>
 * @copyright 2014.04.26 By chujiu
 * @version 0.2.1 2014.04.26
 */

/**
 * Fetches several Yahoo! Japan result pages for a keyword in parallel
 * (curl multi) and extracts the links found in the result section.
 */
class DataCollectionRank {

    /** Step between two consecutive `b=` start offsets (presumably results per page). */
    const PAGE = 10;

    /** Directory the error log is written to; set by exec_curl_get_data(). */
    public $path = '';

    /** Largest `b=` start offset that is requested (inclusive); 91 yields offsets 1, 11, ..., 91. */
    public $main = 91;

    /**
     * Creates one curl handle per result page and registers it on a multi handle.
     *
     * @param string $keyword Search keyword.
     * @return array|string '' when $keyword is empty, otherwise
     *                      array('handle' => array(offset => array('ch' => handle, 'url' => string)), 'mh' => multi handle).
     */
    private function _gather_data($keyword) {
        if (empty($keyword)) {
            return '';
        }

        // 'handle' is pre-initialised so callers can iterate it even when the
        // loop below does not run (e.g. $main < 1).
        $chs = array('handle' => array());
        $mh = curl_multi_init();
        $query = urlencode($keyword);

        for ($offset = 1; $offset <= $this->main; $offset += self::PAGE) {
            $url = 'http://search.yahoo.co.jp/search?p='.$query.'&tid=top_ga1_sa&ei=UTF-8&aq=-1&oq='.$query.'&pstart=1&fr=top_ga1_sa&b='.$offset;

            $ch = curl_init();
            curl_setopt_array($ch, array(
                CURLOPT_URL            => $url,
                CURLOPT_HEADER         => false,
                CURLOPT_SSL_VERIFYPEER => false,
                CURLOPT_RETURNTRANSFER => true,
                CURLOPT_TIMEOUT        => 30,
                CURLOPT_AUTOREFERER    => true,
            ));
            curl_multi_add_handle($mh, $ch);

            $chs['handle'][$offset] = array('ch' => $ch, 'url' => $url);
        }

        $chs['mh'] = $mh;
        return $chs;
    }

    /**
     * Runs all page requests, logs transfer errors and returns the extracted links.
     *
     * @param string $keyword Search keyword.
     * @param string $path    Directory in which 'get_rank_log.txt' is appended on errors.
     * @return array|string '' when $keyword is empty; otherwise a 0-based list of links
     *                      in page order (an empty array when no link was extracted).
     */
    public function exec_curl_get_data($keyword, $path) {
        $this->path = $path;

        $chs = $this->_gather_data($keyword);
        if (empty($chs)) {
            return '';
        }
        $mh = $chs['mh'];

        // Drive the transfers. curl_multi_select() blocks until there is socket
        // activity, so the loop no longer spins at 100% CPU while waiting.
        $active = null;
        do {
            $status = curl_multi_exec($mh, $active);
            if ($active > 0 && $status === CURLM_OK) {
                if (curl_multi_select($mh, 1.0) === -1) {
                    // select() could not wait on any descriptor; back off briefly.
                    usleep(1000);
                }
            }
        } while ($active > 0 && ($status === CURLM_OK || $status === CURLM_CALL_MULTI_PERFORM));

        $error = '';
        $links = '';
        foreach ($chs['handle'] as $item) {
            $failure = curl_error($item['ch']);
            if ($failure !== '') {
                $error .= "\n".'error提示:'.$failure.'-------URL:'.$item['url'].'--------時間:'.date('Y-m-d H:i:s')."\n";
            } else {
                $body = curl_multi_getcontent($item['ch']);
                if (!empty($body)) {
                    $links .= $this->_get_keyword_link_preg($body);
                }
            }

            // Release the handle whether or not the transfer succeeded.
            curl_multi_remove_handle($mh, $item['ch']);
            curl_close($item['ch']);
        }
        curl_multi_close($mh);

        if ($error !== '') {
            $this->_writeFile('get_rank_log.txt', $error, 'ab+');
        }

        if ($links === '') {
            return array();
        }
        // Every link is followed by '|'; drop the trailing one before splitting.
        return explode('|', substr($links, 0, -1));
    }

    /**
     * Extracts the href values from the result section of one result page.
     *
     * The section starts after '<div id="web">' and ends before the
     * '<div id="posS" class="spns">', '<div id="pg">' or '<div id="rel">' markers;
     * <em>...</em> spans are removed before matching.
     *
     * @param string $str Raw HTML of a result page.
     * @return string URL-decoded links, each followed by '|'; '' when nothing matches.
     */
    private function _get_keyword_link_preg($str) {
        if (empty($str)) {
            return '';
        }

        $parts = explode('<div id="web">', $str);
        if (!isset($parts[1])) {
            // Marker missing (different layout, error page, ...): nothing to extract.
            return '';
        }

        $section = explode('<div id="posS" class="spns">', $parts[1]);
        $html = preg_replace(
            array(
                '#<div id=\"pg\">[\s\S]+#',
                '#<div id=\"rel\">[\s\S]+#',
                '#<em>[\s\S]+?</em>#',
            ),
            '',
            $section[0]
        );
        if (!is_string($html)) {
            // preg_replace() returns null on a PCRE error (e.g. backtrack limit).
            return '';
        }

        $res = '';
        if (preg_match_all('#href=\"(.*?)\">#', $html, $matches)) {
            foreach ($matches[1] as $href) {
                $res .= urldecode($href).'|';
            }
        }
        return $res;
    }

    /**
     * Writes $data to $this->path.'/'.$fileName.
     *
     * With the default "rb+" mode the file is overwritten from the start and
     * truncated to the length of $data; other modes (e.g. "ab+") are passed to
     * fopen() unchanged. Returns silently when the file cannot be opened.
     *
     * @param string $fileName File name relative to $this->path.
     * @param string $data     Bytes to write.
     * @param string $method   fopen() mode.
     * @param int    $iflock   Non-zero: take an exclusive lock while writing.
     * @param int    $check    Non-zero: terminate the script with '403 Forbidden!'
     *                         when the full path contains '..'.
     * @param int    $chmod    Non-zero: chmod the file to 0777 afterwards.
     */
    public function _writeFile($fileName, $data, $method="rb+", $iflock=1, $check=1, $chmod=1) {
        $file = $this->path.'/'.$fileName;

        if ($check && strpos($file, '..') !== false) {
            exit('403 Forbidden!');
        }

        // "rb+" requires an existing file, so make sure there is one.
        @touch($file);
        $handle = @fopen($file, $method);
        if ($handle === false) {
            return;
        }

        if ($iflock) {
            flock($handle, LOCK_EX);
        }
        fwrite($handle, $data);
        if ($method == "rb+") {
            // Flush buffered bytes first so truncation acts on the final content.
            fflush($handle);
            ftruncate($handle, strlen($data));
        }
        fclose($handle); // also releases the lock

        if ($chmod) {
            @chmod($file, 0777);
        }
    }
}
119 ?>
/**
 * Removes duplicate rows from a list of (keyword, domain) rows.
 *
 * Two rows are duplicates when all of their values, compared as strings and
 * in order, are equal. Rows are identified by a serialized copy of their
 * values rather than by a comma-joined string, so values that contain commas
 * are neither corrupted nor confused with other rows.
 *
 * @param array $array List of rows; each row is an array whose first value is
 *                     the keyword and whose second value is the domain.
 * @return array Unique rows as array('keyword' => string, 'domain' => string|null),
 *               in order of first appearance. Each entry is keyed by the
 *               0-based position of the LAST occurrence of that row in $array.
 */
function array_unique_fb($array){
    $lastPosition = array(); // row identity => position of its last occurrence
    $rows = array();         // row identity => stringified values (first-seen order)
    $position = 0;

    foreach ($array as $row) {
        $values = array_map('strval', array_values($row));
        $identity = serialize($values);

        if (!isset($rows[$identity])) {
            $rows[$identity] = $values;
        }
        $lastPosition[$identity] = $position;
        $position++;
    }

    $data = array();
    foreach ($rows as $identity => $values) {
        $data[$lastPosition[$identity]] = array(
            'keyword' => isset($values[0]) ? $values[0] : '',
            'domain'  => isset($values[1]) ? $values[1] : null,
        );
    }
    return $data;
}