代碼出處:jUnion
適用平台:Windows, Linux(Ubuntu),php-5.2.5+,Apache
功能:抓取整個站點的圖片。目前尚未借助 PHP 的 cURL 擴展實現,後期再行完善。
配置:config目錄下
domain_name:域名(默認:bizhibar.com)
request_site:網站網址(默認:http://www.bizhibar.com/)
request_url:從網站的哪個頁面開始(默認:http://www.bizhibar.com/)
accept_type: 圖片類型(默認:gif, bmp, png, ico, jpg, jpeg)
save_path:圖片保存路徑(默認:savefiles/)
partition_name:圖片保存目錄名稱前綴(默認:img_)
dir_file_limit: 每個目錄容許多少個文件(默認:100)
serialize_img_size: 當讀取了多少個圖片地址才緩存到cache目錄下的accompImg文件當中,下次繼續抓取的時候會忽略這些地址。(默認:30)
serialize_url_size:與serialize_img_size類似,當讀取了多少個鏈接地址才緩存到cache目錄下的overURL文件當中,下次繼續抓取的時候會忽略這些地址。(默認:10)
說明:歡迎諸君批評指教,有任何新問題或者需要改進的地方,請您反饋給我
<?php
/*
 * Entry script: wires up the file locations the crawler needs and starts the crawl.
 */

// A full-site crawl can run for a long time, so lift PHP's execution time limit.
set_time_limit( 0 );

// Capture.const.php is loaded first; __Home__ and __Os__ are used right after it,
// so it presumably defines them (confirm against that file).
require dirname( __FILE__ ) . DIRECTORY_SEPARATOR . 'include' . DIRECTORY_SEPARATOR . 'Capture.const.php';
require __Home__ . 'include' . __Os__ . 'Capture.class.php';

$_cfg = array();

// Configuration files read by Capture's constructor.
$_cfg['site'] = __Home__ . 'config' . __Os__ . 'capture.site.php';
$_cfg['preg'] = __Home__ . 'config' . __Os__ . 'capture.preg.php';

// Cache files holding serialized progress from earlier runs.
$_cfg['accompImg'] = __Home__ . 'cache' . __Os__ . 'accompImg';
$_cfg['overURL']   = __Home__ . 'cache' . __Os__ . 'overURL';

// Capture takes the configuration by reference, so it must be passed as a variable.
$_parse = new Capture( $_cfg );
$_parse->parseQuestUrl();
?>
<?php
/**
 * Crawl driver.
 *
 * Fetches the configured start page, hands every fetched page to Pivotal for parsing and
 * follows the links Pivotal returns, restricted to those containing the configured domain name.
 * Links that have already been followed are remembered in self::$_overURL and serialized to
 * the 'overURL' cache file so a later run can skip them.
 *
 * @author pankai<530911044@qq.com>
 * @date 2013-08-10
 */
class Capture {
    /** @var array file locations passed to the constructor ('site', 'preg', 'accompImg', 'overURL') */
    private static $_Config = array();

    /** @var array|null value returned by the 'site' configuration file */
    private static $_CapSite = NULL;

    /** @var array|null value returned by the 'preg' configuration file, with '_request_site' substituted */
    private static $_CapPreg = NULL;

    /** @var array list of links that have already been followed */
    private static $_overURL = array();

    /** @var bool FALSE while getCapInstance() is running, TRUE once it has returned normally */
    private $_mark = FALSE;

    /** @var int multiplier applied to the per-page delay; doubled after a failed attempt */
    private static $_markTime = 1;

    /**
     * Load the configuration files and restore cached progress from earlier runs.
     *
     * @param array $_cfg file locations, keyed by 'site', 'preg', 'accompImg' and 'overURL';
     *                    taken by reference, so callers must pass a variable
     */
    public function __construct( &$_cfg ) {
        self::$_Config = &$_cfg;
        self::$_CapSite = require $_cfg['site'];
        self::$_CapPreg = require $_cfg['preg'];

        // Expand the '_request_site' placeholder in every configured pattern.
        foreach( self::$_CapPreg as $_key => $_value ) {
            self::$_CapPreg[$_key] = str_replace( '_request_site', self::$_CapSite['request_site'], $_value );
        }

        self::import( 'file.OperateFile' );
        $_visited = self::loadCache( $_cfg['overURL'] );
        // Only accept an array: anything else would break the in_array() lookups later on.
        if( is_array( $_visited ) ) {
            self::$_overURL = $_visited;
        }

        self::import( 'pivotal.Pivotal' );
        $_images = self::loadCache( $_cfg['accompImg'] );
        // Leave Pivotal's own default in place when the cache is missing or unreadable.
        if( $_images !== NULL ) {
            Pivotal::$_accompImg = $_images;
        }
    }

    /**
     * Read and unserialize one cache file.
     *
     * The cache files are written by this program itself; unserialize() must not be
     * pointed at data from an untrusted source.
     *
     * @param string $_path cache file location
     * @return mixed the unserialized value, or NULL when the file is missing, empty or cannot be decoded
     */
    private static function loadCache( $_path ) {
        if( !file_exists( $_path ) ) {
            return NULL;
        }
        $_size = filesize( $_path );
        if( $_size === FALSE || $_size <= 0 ) {
            return NULL;
        }
        $_data = unserialize( OperateFile::readText( $_path, $_size ) );
        // unserialize() returns FALSE for a corrupt payload; report that as "no cache".
        return $_data === FALSE ? NULL : $_data;
    }

    /**
     * Load a class file, Java-package style: 'file.OperateFile' maps to
     * include/file/OperateFile.class.php below __Home__.
     *
     * @param string $_class dot-separated class path
     */
    public static function import( $_class ) {
        require_once __Home__.'include'.__Os__.str_replace( '.', __Os__, $_class ).'.class.php';
    }

    /**
     * Parse one fetched page with a fresh Pivotal instance.
     *
     * The argument is taken by value because every caller passes the result of Http::get()
     * directly, and a function result cannot be bound to a reference parameter without PHP
     * raising "Only variables should be passed by reference".
     *
     * @param mixed $_source page content as returned by Http::get()
     * @return mixed whatever Pivotal::parseUrl() returns; roundTagA() treats it as a
     *               (possibly nested) list of links
     */
    private function getCapInstance( $_source ) {
        $this->_mark = FALSE;
        $_Captal = new Pivotal( self::$_Config, $_source );
        $_tagA = $_Captal->parseUrl();
        $this->_mark = TRUE;
        return $_tagA;
    }

    /**
     * Walk a (possibly nested) list of links and follow each new same-domain link in turn.
     *
     * Every followed page is parsed and its own links are walked recursively, so the
     * recursion depth grows with the length of the link chain being followed.
     *
     * @param array|null $_tagArr links to follow; anything that is not a non-empty array is ignored
     */
    private function roundTagA( &$_tagArr ) {
        if( !is_array( $_tagArr ) || count( $_tagArr ) === 0 ) {
            return;
        }

        // A missing or non-positive batch size would make the modulo below divide by zero,
        // so fall back to persisting after every link.
        $_batch = max( 1, (int) self::$_CapSite['serialize_url_size'] );

        // foreach rather than an index loop: the list is not guaranteed to have keys 0..n-1.
        foreach( $_tagArr as $_link ) {
            if( is_array( $_link ) ) {
                $this->roundTagA( $_link );
                continue;
            }

            // Substring test: keep only links that mention the configured domain name.
            if( !is_string( $_link ) || stripos( $_link, self::$_CapSite['domain_name'] ) === FALSE ) {
                continue;
            }
            if( in_array( $_link, self::$_overURL, TRUE ) ) {
                continue;
            }

            // Record the link before fetching it so it is never followed twice.
            self::$_overURL[] = $_link;
            if( count( self::$_overURL ) % $_batch == 0 ) {
                OperateFile::setText( self::$_Config['overURL'], serialize( self::$_overURL ) );
            }

            // NOTE(review): getCapInstance() sets $_mark to TRUE before every normal return, so
            // as written this loop runs once per link; the doubling branch is only reached if
            // that ever changes.
            do {
                $_tagA = $this->getCapInstance( Http::get( $_link ) );
                sleep( max( 0, (int) ( self::$_CapSite['preform_page_time'] * self::$_markTime ) ) );
                if( $this->_mark === TRUE ) {
                    // Reset the multiplier to its initial value. Resetting it to the delay itself
                    // squared the wait from the second page on and pinned it at 0 for a 0 delay.
                    self::$_markTime = 1;
                    break;
                }
                self::$_markTime *= 2;
            } while( true );

            /* parse the main page and return next page */
            $this->roundTagA( $_tagA );
        }
    }

    /**
     * Run the crawl: fetch and parse the configured start page, follow its links, then
     * persist the list of followed links.
     */
    public function parseQuestUrl() {
        self::import( 'http.Http' );
        $_round_Arr = $this->getCapInstance( Http::get( self::$_CapSite['request_url'] ) );
        $this->roundTagA( $_round_Arr );

        // The in-loop writes only happen on multiples of the batch size; flush the remainder
        // so the last few followed links are not lost for the next run.
        if( count( self::$_overURL ) > 0 ) {
            OperateFile::setText( self::$_Config['overURL'], serialize( self::$_overURL ) );
        }
    }
}
?>