<?php
/**
 * Article scraper.
 *
 * Downloads a list page, collects the article links found between two marker
 * strings, fetches every linked detail page and extracts its main text with
 * the line-block heuristic implemented by textExtract. Intermediate data is
 * written to mid/, line_len/ and bl_size/; extracted text goes to pageText/.
 *
 * NOTE(review): the copy of this file under review had been collapsed onto a
 * few physical lines and had lost its backslashes ("%sn", "rn", '/s+/',
 * '</script>' inside a '/'-delimited pattern, ...). The escapes below were
 * restored from context - confirm against the pristine source if available.
 *
 * The classes are declared before the script body so the script does not
 * depend on class hoisting.
 */

/**
 * Page content gathering class.
 *
 * @author Milkcy QQ:9877633
 * @copyright (C) 2012-2015 TCCMS.COM
 * @lastmodify 2012-07-10 14:00
 */
class gather
{
    public $pagestring = '';

    /** Whatever the global $db holds at construction time; not used in this file. */
    private $db;

    public function __construct()
    {
        global $db;
        $this->db = $db;
    }

    /**
     * Download a URL and return its body with surrounding whitespace trimmed.
     *
     * Uses cURL (following redirects) when the extension is loaded and falls
     * back to file_get_contents() otherwise.
     *
     * @param string $url Address to fetch; surrounding whitespace is ignored.
     * @return string The response body, or '' when the request failed.
     */
    public function geturlfile($url)
    {
        $url = trim($url);

        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            // Bound the request so one dead host cannot stall the whole run.
            curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
            curl_setopt($ch, CURLOPT_TIMEOUT, 60);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }

        // Both transports signal failure with false.
        if ($content === false) {
            return '';
        }

        return trim($content);
    }

    /**
     * Collect the href and link text of every anchor in an HTML fragment.
     *
     * @param string $code HTML fragment to scan.
     * @return array array('name' => link texts, 'url' => href values), index-aligned.
     */
    public function get_all_url($code)
    {
        $arr = array();
        preg_match_all(
            '/<a.+?href=["|\']?([^>"\' ]+)["|\']?\s*[^>]*>([^>]+)<\/a>/is',
            $code,
            $arr
        );

        // On a PCRE failure the capture groups may be missing.
        return array(
            'name' => isset($arr[2]) ? $arr[2] : array(),
            'url'  => isset($arr[1]) ? $arr[1] : array(),
        );
    }

    /**
     * Return the text between the first $start marker and the next $end marker.
     *
     * @param string $str   Text to cut from.
     * @param string $start Opening marker (trimmed before use).
     * @param string $end   Closing marker (trimmed before use).
     * @return string $str unchanged when either marker is empty; '' when
     *                $start does not occur; otherwise the enclosed text.
     */
    public function get_sub_content($str, $start, $end)
    {
        $start = trim($start);
        $end = trim($end);
        if ($start === '' || $end === '') {
            return $str;
        }

        $afterStart = explode($start, $str);
        if (!isset($afterStart[1])) {
            // Start marker absent: nothing to extract.
            return '';
        }

        $beforeEnd = explode($end, $afterStart[1]);

        return $beforeEnd[0];
    }

    /**
     * Dump a variable inside a styled <div>/<pre> wrapper (debug helper).
     *
     * @param mixed $var Value to dump.
     * @return void
     */
    public function vd($var)
    {
        echo '<div style="border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;">' . "\r\n";
        echo '<pre style="font-family:Arial,Vrinda;font-size:14px;">' . "\r\n";
        var_dump($var);
        echo "\r\n</pre>\r\n";
        echo '</div>';
    }
}

# Script - class.textExtract.php
/**
 * textExtract - text extraction class
 * Created on 2010-08-10
 * author: Wenfeng Xuan
 * Email: wfxuan@insun.hit.edu.cn
 * Blog: http://hi.baidu.com/xwf_like
 */
class textExtract
{
    /**
     * The web page's source code.
     * @var string
     */
    public $rawPageCode = '';

    /**
     * The page text after preprocessing, one whitespace-free entry per line.
     * @var array
     */
    public $textLines = array();

    /**
     * The length of each block of $blkSize consecutive lines.
     * @var array
     */
    public $blksLen = array();

    /**
     * The final extracted text.
     * @var string
     */
    public $text = '';

    /**
     * How many single lines form one block; the only parameter of the method.
     * @var int
     */
    public $blkSize;

    /**
     * @param string $_rawPageCode HTML source of the page.
     * @param int    $_blkSize     Number of lines per block.
     */
    public function __construct($_rawPageCode, $_blkSize = 3)
    {
        $this->rawPageCode = $_rawPageCode;
        $this->blkSize = $_blkSize;
    }

    /**
     * Strip markup from the raw page code.
     *
     * Removes, in order: the DOCTYPE, HTML comments, <script> blocks,
     * <style> blocks, all remaining tags, and short character entities.
     *
     * @return string The page text with line breaks preserved.
     */
    public function preProcess()
    {
        $content = (string) $this->rawPageCode;

        $patterns = array(
            '/<!DOCTYPE.*?>/si',            // 1. DTD information
            '/<!--.*?-->/s',                // 2. HTML comments
            '/<script.*?>.*?<\/script>/si', // 3. JavaScript
            '/<style.*?>.*?<\/style>/si',   // 4. CSS
            '/<.*?>/s',                     // 5. remaining HTML tags
            '/&.{1,5};|&#.{1,5};/',         // 6. some special characters
        );

        foreach ($patterns as $pattern) {
            $stripped = preg_replace($pattern, '', $content);
            // preg_replace() yields null on a PCRE error (e.g. backtrack
            // limit); keep the previous text rather than losing everything.
            if ($stripped !== null) {
                $content = $stripped;
            }
        }

        return $content;
    }

    /**
     * Split preprocessed text into lines and append them to $textLines.
     *
     * "\r\n" and "\r" are normalised to "\n" first; all whitespace inside
     * each line is then removed.
     *
     * @param string $rawText Text returned by preProcess().
     * @return void
     */
    public function getTextLines($rawText)
    {
        $rawText = str_replace(array("\r\n", "\n", "\r"), "\n", $rawText);

        foreach (explode("\n", $rawText) as $line) {
            $this->textLines[] = preg_replace('/\s+/s', '', $line);
        }
    }

    /**
     * Compute the byte length of each sliding block of $blkSize lines and
     * append the results to $blksLen.
     *
     * NOTE(review): as in the original algorithm the loop stops one window
     * short of the last full block; left unchanged so extraction results
     * stay the same.
     *
     * @return void
     */
    public function calBlocksLen()
    {
        $textLineNum = count($this->textLines);

        // First block; clamp so short pages do not index past the last line.
        $firstBlockEnd = min($this->blkSize, $textLineNum);
        $blkLen = 0;
        for ($i = 0; $i < $firstBlockEnd; $i++) {
            $blkLen += strlen($this->textLines[$i]);
        }
        $this->blksLen[] = $blkLen;

        // Each later block equals the previous one plus the line entering
        // the window minus the line leaving it.
        for ($i = 1; $i < ($textLineNum - $this->blkSize); $i++) {
            $blkLen = $blkLen
                + strlen($this->textLines[$i - 1 + $this->blkSize])
                - strlen($this->textLines[$i - 1]);
            $this->blksLen[] = $blkLen;
        }
    }

    /**
     * Extract the main text of the page.
     *
     * Idea: the main text is the longest continuous run of non-empty blocks.
     * Echoes the 1-based first and last block index of the chosen run, each
     * followed by '<br />' ("0<br />0<br />" when nothing was found).
     *
     * @return string The concatenated lines of the chosen run.
     */
    public function getPlainText()
    {
        // Start from a clean state so a second call does not append to the
        // results of the first.
        $this->textLines = array();
        $this->blksLen = array();
        $this->text = '';

        $this->getTextLines($this->preProcess());
        $this->calBlocksLen();

        $start = $end = -1;
        $i = $maxTextLen = 0;
        $blkNum = count($this->blksLen);

        while ($i < $blkNum) {
            // Skip empty blocks.
            while (($i < $blkNum) && ($this->blksLen[$i] == 0)) {
                $i++;
            }
            if ($i >= $blkNum) {
                break;
            }

            // Accumulate one run of consecutive non-empty blocks.
            $runStart = $i;
            $curTextLen = 0;
            $portion = '';
            while (($i < $blkNum) && ($this->blksLen[$i] != 0)) {
                $portion .= $this->textLines[$i];
                $curTextLen += $this->blksLen[$i];
                $i++;
            }

            if ($curTextLen > $maxTextLen) {
                $this->text = $portion;
                $maxTextLen = $curTextLen;
                $start = $runStart;
                $end = $i - 1;
            }
        }

        echo ($start + 1) . '<br />' . ($end + 1) . '<br />';

        return $this->text;
    }
}

// ---------------------------------------------------------------------------
// Script body
// ---------------------------------------------------------------------------

// Encoding of the printed page.
header("Content-type: text/html; charset=utf-8");

// Maximum run time for the whole scrape.
set_time_limit(60 * 15);

// Folder for saved page content (not used in this file).
$sourceDir = 'pageFile/';

// Folder for the extracted text.
$textDir = 'pageText/';

// Create each output folder that is missing; checking them one by one avoids
// calling mkdir() on folders that already exist.
foreach (array($textDir, 'mid', 'line_len', 'bl_size') as $dir) {
    if (!is_dir($dir)) {
        mkdir($dir);
    }
}

// Site being scraped; prefixed to the links found on the list page.
$qianzui = 'http://www.thinkphp.cn';

// Target list page.
$url = 'http://www.thinkphp.cn/code/index.html';

$gather = new gather();

// Fetch the list page.
$html = $gather->geturlfile($url);

// Detect the source encoding and convert to UTF-8 when needed.
if ($html !== '') {
    $encoding = mb_detect_encoding($html, array("ASCII", "UTF-8", "GB2312", "GBK", "BIG5"));
    // Detection returns false when no candidate fits; leave the text as is.
    if ($encoding !== false && $encoding != "UTF-8") {
        $html = mb_convert_encoding($html, "UTF-8", $encoding);
    }
}

// Markers delimiting the article list inside the page.
$start = '<div class="box extend">';
$end = '/教程列表 -->';

// Collect the article URLs and titles inside that region.
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);

// Build the full detail-page URLs, skipping navigation links.
$url_ary = array();
foreach ($newsAry['url'] as $v) {
    if (
        strpos($v, 'functions') === false
        && strpos($v, 'system') === false
        && strpos($v, 'examples') === false
        && strpos($v, '#') === false
    ) {
        $url_ary[] = $qianzui . $v;
    }
}

// Number of detail pages to fetch.
$fileNum = count($url_ary);

$BL_BLOCK = 3;

// Extract the content of every detail page.
for ($j = 0; $j < $fileNum; $j++) {
    // Fetch the page; skip it when the download fails.
    $content = file_get_contents($url_ary[$j]);
    if ($content === false) {
        continue;
    }

    $iTextExtractor = new textExtract($content, $BL_BLOCK);
    $text = $iTextExtractor->getPlainText();
    $lineNum = count($iTextExtractor->textLines);
    echo $lineNum . '<br />';

    // Preprocessed lines, and the byte length of each of them.
    $midDump = '';
    $lineLenDump = '';
    foreach ($iTextExtractor->textLines as $line) {
        $midDump .= $line . "\n";
        $lineLenDump .= strlen($line) . "\n";
    }
    file_put_contents('mid/mid_' . $j . '.txt', $midDump);
    file_put_contents('line_len/line_len_' . $j . '.txt', $lineLenDump);

    // Length of each line block.
    $blSizeDump = '';
    foreach ($iTextExtractor->blksLen as $blkLen) {
        $blSizeDump .= (int) $blkLen . "\n";
    }
    file_put_contents('bl_size/bl_size_' . $j . '.txt', $blSizeDump);

    // Extracted main text.
    file_put_contents($textDir . $j . '.html', $text . "\n");
}