程序師世界是廣大編程愛好者互助、分享、學習的平台,程序師世界有你更精彩!
首頁
編程語言
C語言|JAVA編程
Python編程
網頁編程
ASP編程|PHP編程
JSP編程
數據庫知識
MYSQL數據庫|SqlServer數據庫
Oracle數據庫|DB2數據庫
 程式師世界 >> 編程語言 >> 網頁編程 >> PHP編程 >> 關於PHP編程 >> php正則抓取整個域名下的圖片

php正則抓取整個域名下的圖片

編輯:關於PHP編程

代碼出處:jUnion

適用平台:Windows, Linux(Ubuntu),php-5.2.5+,Apache

功能:抓取整個站點的圖片,暫無借助php的curl插件開發, 後期完善

配置:config目錄下
     domain_name:域名(默認:bizhibar.com)
     request_site:網站網址(默認:http://www.bizhibar.com/)
     request_url:從網站的哪個頁面開始(默認:http://www.bizhibar.com/)
     accept_type: 圖片類型(默認:gif, bmp, png, ico,  jpg, jpeg)
     save_path:圖片保存路徑(默認:savefiles/)
     partition_name:圖片保存目錄名稱前綴(默認:img_)
     dir_file_limit: 每個目錄容許多少個文件(默認:100)
     serialize_img_size: 當讀取了多少個圖片地址才緩存到cache目錄下的accompImg文件當中,下次繼續抓取的時候會忽略這些地址。(默認:30)
     serialize_url_size:與serialize_img_size類似,當讀取了多少個鏈接地址才緩存到cache目錄下的overURL文件當中,下次繼續抓取的時候會忽略這些地址。(默認:10)

說明:歡迎諸君批評指教,有任何新問題或者需要改進的地方,請您反饋給我

<?php
/*
 * Entry script: loads the constants file and the Capture class, builds the
 * path table that Capture expects, then starts crawling.
 *
 * __Home__ and __Os__ are not defined in this script; presumably they come
 * from Capture.const.php (base directory and directory separator) - confirm
 * against that file.
 */

// Remove the execution time limit for this run.
set_time_limit(0);

require implode( DIRECTORY_SEPARATOR, array( dirname(__FILE__), 'include', 'Capture.const.php' ) );
require __Home__.'include'.__Os__.'Capture.class.php';

// Path table handed to Capture: two config files and two cache files.
$_cfg = array();
$_cfg['site']      = __Home__.'config'.__Os__.'capture.site.php';
$_cfg['preg']      = __Home__.'config'.__Os__.'capture.preg.php';
$_cfg['accompImg'] = __Home__.'cache'.__Os__.'accompImg';
$_cfg['overURL']   = __Home__.'cache'.__Os__.'overURL';

// Capture takes the table by reference, so it must be passed as a variable.
$_parse = new Capture( $_cfg );
$_parse->parseQuestUrl();

?>
<?php
/**
 * The main class: starts at the configured request_url, hands every fetched
 * page to Pivotal, and follows the returned links that belong to the
 * configured domain. Visited URLs are kept in a list that is periodically
 * serialized to the overURL cache file so a later run can skip them.
 *
 * Depends on the constants __Home__ / __Os__ and on the classes OperateFile,
 * Pivotal and Http, none of which are defined in this file.
 *
 * @author pankai<[email protected]>
 * @date 2013-08-10
 */
class Capture {
	/* path table passed to the constructor (site, preg, accompImg, overURL) */
	private static $_Config = array();
	
	/* arrays returned by the 'site' and 'preg' config files */
	private static $_CapSite = NULL;
	private static $_CapPreg = NULL;
	
	/* URLs that have already been queued for fetching */
	private static $_overURL = array();
	
	/* TRUE once getCapInstance() has finished parsing a page */
	private $_mark = FALSE;
	/* multiplier applied to preform_page_time when sleeping between pages */
	private static $_markTime = 1;
	
	/**
	 * initialize the main class: Capture
	 *
	 * Loads both config files, substitutes the '_request_site' placeholder in
	 * every preg entry with the configured request_site, and restores the two
	 * cache lists when their files exist and hold a valid serialized array.
	 *
	 * @param $_cfg array paths keyed by 'site', 'preg', 'accompImg', 'overURL'
	 */
	public function __construct( &$_cfg ) {
		self::$_Config = &$_cfg;
		
		self::$_CapSite = require $_cfg['site'];
		self::$_CapPreg = require $_cfg['preg'];
		
		foreach( self::$_CapPreg as $_key => $_value ) {
			self::$_CapPreg[$_key] = str_replace( '_request_site', self::$_CapSite['request_site'], $_value );
		}
		
		self::import( 'file.OperateFile' );
		$_cached = self::readCachedList( $_cfg['overURL'] );
		if( $_cached !== NULL ) {
			self::$_overURL = $_cached;
		}
		
		self::import('pivotal.Pivotal');
		$_cached = self::readCachedList( $_cfg['accompImg'] );
		if( $_cached !== NULL ) {
			Pivotal::$_accompImg = $_cached;
		}
	}
	
	/**
	 * read a serialized list from a cache file
	 *
	 * A missing, empty or corrupt file yields NULL so the caller keeps its
	 * current list instead of replacing it with FALSE from unserialize().
	 * OperateFile must already be imported.
	 *
	 * @param $_path string cache file path
	 * @return array|NULL the stored array, or NULL when nothing usable exists
	 */
	private static function readCachedList( $_path ) {
		if( !file_exists( $_path ) ) {
			return NULL;
		}
		$_size = filesize( $_path );
		if( $_size === FALSE || $_size <= 0 ) {
			return NULL;
		}
		$_list = unserialize( OperateFile::readText( $_path, $_size ) );
		
		return is_array( $_list ) ? $_list : NULL;
	}
	
	/**
	 * load class, follow Java programmer(package): import com.jUnion.Capture
	 *
	 * Dots in the name become directory separators below the include
	 * directory, e.g. 'file.OperateFile' -> include/file/OperateFile.class.php
	 *
	 * @param $_class string dotted class path
	 */
	public static function import( $_class ) {
		require_once __Home__.'include'.__Os__.str_replace( '.', __Os__, $_class ).'.class.php';
	}
	
	/**
	 * create an instance of Pivotal class and let it parse one page
	 *
	 * $_mark is FALSE while parsing is in progress and TRUE once parseUrl()
	 * has returned.
	 *
	 * @param $_source page content as returned by Http::get() (taken by
	 *                 reference, so callers must pass a variable)
	 * @return the value of Pivotal::parseUrl(); presumably the links found on
	 *         the page, possibly nested - confirm against Pivotal
	 */
	private function getCapInstance( &$_source ) {
		$this->_mark = FALSE;
		
		$_Captal = new Pivotal( self::$_Config, $_source );
		$_tagA = $_Captal->parseUrl();
		
		$this->_mark = TRUE;
		
		return $_tagA;
	}
	
	/**
	 * go forward one by one
	 *
	 * Walks a (possibly nested) list of URLs. URLs outside the configured
	 * domain and URLs already seen are skipped; every other URL is recorded,
	 * fetched, parsed, and its own links are followed recursively.
	 *
	 * @param $_tagArr array|NULL list of URLs, may contain nested lists
	 */
	private function roundTagA( &$_tagArr ) {
		if( !is_array( $_tagArr ) || count( $_tagArr ) === 0 ) {
			return;
		}
		/* 0 means "persist after every URL"; avoids a modulo-by-zero error */
		$_flushEvery = (int) self::$_CapSite['serialize_url_size'];
		
		/* foreach does not depend on the keys being a contiguous 0..n-1 range */
		foreach( $_tagArr as $_item ) {
			if( is_array( $_item ) ) {
				$this->roundTagA( $_item );
				continue;
			}
			if( stripos( $_item, self::$_CapSite['domain_name'] ) === FALSE ) {
				continue;
			}
			if( in_array( $_item, self::$_overURL, TRUE ) ) {
				continue;
			}
			self::$_overURL[] = $_item;
			if( $_flushEvery === 0 || count( self::$_overURL ) % $_flushEvery === 0 ) {
				OperateFile::setText( self::$_Config['overURL'], serialize( self::$_overURL ) );
			}
			do {
				/* getCapInstance() takes its argument by reference: pass a variable */
				$_source = Http::get( $_item );
				$_tagA = $this->getCapInstance( $_source );
				sleep( self::$_CapSite['preform_page_time'] * self::$_markTime );
				if( $this->_mark === TRUE ) {
					/* back to the initial multiplier so the next delay is
					   preform_page_time again, not preform_page_time squared */
					self::$_markTime = 1;
					break;
				}
				/* NOTE(review): getCapInstance() always sets _mark to TRUE before
				   returning, so this back-off branch is not reached as written */
				self::$_markTime *= 2;
			} while( TRUE );
			/* parse the main page and return next page */
			$this->roundTagA( $_tagA );
		}
	}
	
	/**
	 * entry point: fetch the configured request_url, parse it and follow
	 * every link it yields
	 */
	public function parseQuestUrl() {
		self::import('http.Http');
		/* getCapInstance() takes its argument by reference: pass a variable */
		$_source = Http::get( self::$_CapSite['request_url'] );
		$_round_Arr = $this->getCapInstance( $_source );
		$this->roundTagA( $_round_Arr );
	}
}

?>

  1. 上一頁:
  2. 下一頁:
Copyright © 程式師世界 All Rights Reserved