php采集页面主文本内容(非正则抓取,采用直接提取)

2017-04-09 后端
<?php
// Declare the encoding of the page this script prints.
header("Content-type: text/html; charset=utf-8");
// Allow the crawl to run for up to 15 minutes before PHP aborts the script.
set_time_limit( 60 * 15 );

// Directory for saved page content (not referenced by the code visible here).
$sourceDir = 'pageFile/';
// Directory the extracted main text is written to.
$textDir = 'pageText/';

// Create every output directory that is missing. Each one is checked on its
// own: calling mkdir() on a directory that already exists emits a warning,
// which happened whenever only some of the four were absent.
foreach( array( $textDir, 'mid', 'line_len', 'bl_size' ) as $outputDir )
{
    if( !is_dir( $outputDir ) )
    {
        mkdir( $outputDir );
    }
}
unset( $outputDir );
 
// Site root; prepended to the links found on the list page.
$qianzui = 'http://www.thinkphp.cn';

// List page whose article links are collected.
$url = 'http://www.thinkphp.cn/code/index.html';

$gather = new gather();
// Download the list page.
$html = $gather->geturlfile($url);

// Detect the source encoding and normalise the page to UTF-8.
// mb_detect_encoding() returns false when none of the candidates match; in
// that case the page is left untouched instead of handing false to
// mb_convert_encoding() as the source encoding.
$encoding = mb_detect_encoding($html, array("ASCII", "UTF-8", "GB2312", "GBK", "BIG5"));
if($encoding !== false && $encoding != "UTF-8")
{
    $html = mb_convert_encoding($html, "UTF-8", $encoding);
}

// Markers delimiting the article list inside the page.
$start = '<div class="box extend">';
$end = '/教程列表 -->';
// Extract the URL and title of every link between the two markers.
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);
// Debug aid: dump the collected links.
//$gather->vd($newsAry);

// Build the full detail-page URLs, skipping links that contain
// 'functions', 'system', 'examples' or '#'.
$url_ary = array();
foreach($newsAry['url'] as $v)
{
    if(!substr_count($v,'functions') && !substr_count($v,'system') && !substr_count($v,'examples') && !substr_count($v,'#'))
    {
        $url_ary[] = $qianzui . $v;
    }
}

// Number of detail pages to process.
$fileNum = count($url_ary);
// Block size (lines per block) handed to the text extractor.
$BL_BLOCK = 3;
 
//require_once( 'class.textExtract.php' );
 
// Extract the main text of every detail page and write four files per page:
// the preprocessed lines, the length of each line, the length of each line
// block, and the extracted text itself.
// The bound is "<": valid indexes of $url_ary run from 0 to $fileNum - 1.
for( $j = 0; $j < $fileNum; $j++ ) {
    // Download the page; skip it when the request fails so that false is
    // never handed to the extractor. $j is kept, so file names still match
    // the index of the URL in $url_ary.
    $content = file_get_contents( $url_ary[$j] );
    if( $content === false )
    {
        echo 'fetch failed: ' . $url_ary[$j] . '<br />';
        continue;
    }

    // Run the extractor.
    $iTextExtractor = new textExtract( $content, $BL_BLOCK );
    $text           = $iTextExtractor->getPlainText();
    $lineNum        = count( $iTextExtractor->textLines );
    echo $lineNum . '<br />';

    // Preprocessed lines, one per line of the output file.
    $midFileName = 'mid/mid_' . $j . '.txt';
    $fod         = fopen( $midFileName, "w" );
    if( $fod !== false )
    {
        foreach( $iTextExtractor->textLines as $line )
        {
            fprintf( $fod, "%s\n", $line );
        }
        fclose( $fod );
    }

    // Byte length of every preprocessed line.
    $lineLenFileName = 'line_len/line_len_' . $j . '.txt';
    $fod             = fopen( $lineLenFileName, "w" );
    if( $fod !== false )
    {
        foreach( $iTextExtractor->textLines as $line )
        {
            fprintf( $fod, "%s\n", strlen($line) );
        }
        fclose( $fod );
    }

    // Length of every line block.
    $blSizeFileName = 'bl_size/bl_size_' . $j . '.txt';
    $fptr           = fopen( $blSizeFileName, "w" );
    if( $fptr !== false )
    {
        foreach( $iTextExtractor->blksLen as $blkLen )
        {
            fprintf( $fptr, "%d\n", $blkLen );
        }
        fclose( $fptr );
    }

    // Extracted main text.
    $textFileName = $textDir . $j . '.html';
    $fod          = fopen( $textFileName, "w" );
    if( $fod !== false )
    {
        fprintf( $fod, "%s\n", $text );
        fclose( $fod );
    }
}
 
 
 
/**
 * Page content gathering class: downloads a page, cuts out a region between
 * two markers and lists the links found in a piece of HTML.
 * @author Milkcy QQ:9877633
 * @copyright            (C) 2012-2015 TCCMS.COM
 * @lastmodify             2012-07-10 14:00
 */
class gather {

    public $pagestring = '';
    private $db;

    /**
     * Stores the global $db (null when no such global exists); none of the
     * methods below read it.
     */
    function __construct() {
        global $db;
        $this->db = $db;
    }

    /**
     * Download a URL, using cURL when the extension is loaded and
     * file_get_contents() otherwise.
     * @param string $url
     * @return string the trimmed response body, or '' when the request failed
     */
    function geturlfile($url) {
        $url = trim($url);
        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }
        // Both transports return false on failure; report that as an empty
        // page explicitly rather than through an implicit bool-to-string cast.
        if ($content === false) {
            return '';
        }
        return trim($content);
    }

    /**
     * Collect the href and the link text of every <a> element in $code.
     * Only anchors whose content holds no nested tag are matched.
     * @param string $code HTML fragment
     * @return array array('name' => link texts, 'url' => hrefs), index-aligned
     */
    function get_all_url($code) {
        // "~" is the delimiter so that "</a>" needs no escaping; the quotes
        // inside the single-quoted PHP literal are backslash-escaped.
        $pattern = '~<a.+?href=["|\']?([^>"\' ]+)["|\']?\s*[^>]*>([^>]+)</a>~is';
        preg_match_all($pattern, $code, $arr);
        return array('name' => $arr[2], 'url' => $arr[1]);
    }

    /**
     * Return the part of $str that follows the first occurrence of $start,
     * cut off at the next occurrence of $start or $end, whichever comes first.
     * @param string $str
     * @param string $start opening marker
     * @param string $end   closing marker
     * @return string $str itself when a marker is empty, '' when $start does
     *                not occur in $str
     */
    function get_sub_content($str, $start, $end) {
        $start = trim($start);
        $end = trim($end);
        if ($start == '' || $end == '') {
            return $str;
        }
        $afterStart = explode($start, $str, 3);
        if (!isset($afterStart[1])) {
            // Start marker absent: there is no region to return.
            return '';
        }
        $beforeEnd = explode($end, $afterStart[1], 2);
        return $beforeEnd[0];
    }

    /**
     * Print var_dump($var) wrapped in a styled <div><pre> for browser output.
     * @param mixed $var
     * @return void
     */
    function vd($var) {
        echo "<div style=\"border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;\">\r\n";
        echo "<pre style=\"font-family:Arial,Vrinda;font-size:14px;\">\r\n";
        var_dump($var);
        echo "\r\n</pre>\r\n";
        echo "</div>";
    }

}
 
 
 
// Content extraction wrapper class
# Script - class.textExtract.php
/**
 * textExtract - text extraction class
 *
 * Strips markup from a page, splits the remainder into lines, sums the line
 * lengths over a sliding window of $blkSize lines ("blocks") and returns the
 * lines of the run of non-empty blocks with the largest total length.
 *
 * Created on 2010-08-10
 * author: Wenfeng Xuan
 * Email: wfxuan@insun.hit.edu.cn
 * Blog: http://hi.baidu.com/xwf_like
 */
class textExtract {

    ///////////////////////////////////
    // MEMBERS
    ///////////////////////////////////

    /**
     * record the web page's source code
     * @var string
     */
    public $rawPageCode = '';

    /**
     * record the text after preprocessing, one whitespace-free entry per line
     * @var array
     */
    public $textLines   = array();

    /**
     * record the length of each block
     * @var array
     */
    public $blksLen     = array();

    /**
     * record the final extracted text
     * @var string
     */
    public $text        = '';

    /**
     * set the size of each block ( regards how many single lines as a block )
     * it is the only parameter of this method
     * @var int
     */
    public $blkSize;

    ///////////////////////////////////
    // METHODS
    ///////////////////////////////////

    /**
     * Set the value of relevant members
     * @param string $_rawPageCode
     * @param int $_blkSize
     * @return void
     */
    function __construct( $_rawPageCode, $_blkSize = 3 ) {
        $this->rawPageCode = $_rawPageCode;
        $this->blkSize     = $_blkSize;
    }

    /**
     * Preprocess the web page's source code: remove the DOCTYPE, comments,
     * scripts, styles, all remaining tags and short character entities.
     * @return string
     */
    function preProcess() {
        // Applied in this order. The "/" of the closing script/style tags is
        // escaped because "/" is also the pattern delimiter.
        $patterns = array(
            '/<!DOCTYPE.*?>/si',               // 1. DTD information
            '/<!--.*?-->/s',                   // 2. HTML comment
            '/<script.*?>.*?<\/script>/si',    // 3. Java Script
            '/<style.*?>.*?<\/style>/si',      // 4. CSS
            '/<.*?>/s',                        // 5. HTML TAGs
            '/&.{1,5};|&#.{1,5};/',            // 6. some special characters
        );

        // Cast so that a non-string source (e.g. false from a failed download)
        // is treated as an empty page.
        $content = (string) $this->rawPageCode;
        foreach( $patterns as $pattern ) {
            $stripped = preg_replace( $pattern, '', $content );
            // preg_replace() yields null on a PCRE error; keep the previous
            // text in that case rather than losing the whole page.
            if( $stripped !== null ) {
                $content = $stripped;
            }
        }

        return $content;
    }

    /**
     * Split the preprocessed text into lines by "\n"
     * after replacing "\r\n" and "\r" with "\n", remove all whitespace
     * inside each line and append the result to $textLines
     * @param string $rawText
     * @return void
     */
    function getTextLines( $rawText ) {
        // normalise the line endings ("\r\n" first so it yields one break)
        $rawText = str_replace( array( "\r\n", "\r" ), "\n", $rawText );

        foreach( explode( "\n", $rawText ) as $line ) {
            // remove the blanks in each line
            $this->textLines[] = preg_replace( '/\s+/s', '', $line );
        }
    }

    /**
     * Calculate the blocks' length from $textLines and store them in
     * $blksLen (any previous content of $blksLen is discarded)
     * @return void
     */
    function calBlocksLen() {
        $textLineNum = count( $this->textLines );
        $lengths     = array();

        // calculate the first block's length; with fewer lines than
        // $blkSize the block simply covers all available lines
        $firstSpan = min( $this->blkSize, $textLineNum );
        $blkLen    = 0;
        for( $i = 0; $i < $firstSpan; $i++ ) {
            $blkLen += strlen( $this->textLines[$i] );
        }
        $lengths[] = $blkLen;

        // calculate the other blocks' length incrementally: slide the window
        // one line down by adding the line that enters and subtracting the
        // line that leaves
        for( $i = 1; $i < ($textLineNum - $this->blkSize); $i++ ) {
            $blkLen = $blkLen
                + strlen( $this->textLines[$i - 1 + $this->blkSize] )
                - strlen( $this->textLines[$i - 1] );
            $lengths[] = $blkLen;
        }

        $this->blksLen = $lengths;
    }

    /**
     * Extract the text from the web page's source code
     * according to the simple idea:
     * [the text should be the longest continuous content
     * in the web page]
     *
     * Also echoes the 1-based first and last block index of the chosen run,
     * each followed by '<br />'.
     * @return string
     */
    function getPlainText() {
        // start from a clean state so that a repeated call does not append
        // the page's lines a second time
        $this->textLines = array();
        $this->text      = '';

        $preProcText = $this->preProcess();
        $this->getTextLines( $preProcText );
        $this->calBlocksLen();

        $start = $end = -1;
        $i = $maxTextLen = 0;

        $blkNum = count( $this->blksLen );
        while( $i < $blkNum ) {
            // skip the empty blocks in front of the next run
            while( ($i < $blkNum) && ($this->blksLen[$i] == 0) ) $i++;
            if( $i >= $blkNum ) break;
            $tmp = $i;

            // walk the run of non-empty blocks, collecting the line that
            // starts each block and the summed block lengths
            $curTextLen = 0;
            $portion = '';
            while( ($i < $blkNum) && ($this->blksLen[$i] != 0) ) {
                $portion .= $this->textLines[$i];
                $curTextLen += $this->blksLen[$i];
                $i++;
            }
            // keep the run with the largest summed length
            if( $curTextLen > $maxTextLen ) {
                $this->text = $portion;
                $maxTextLen = $curTextLen;
                $start = $tmp;
                $end = $i - 1;
            }
        }

        echo ($start + 1) . '<br />' . ($end + 1) . '<br />';
        return $this->text;
    }
}
?>