php采集页面主文本内容(非正则抓取,采用直接提取)
2017-04-09
后端
<?php
// ---------------------------------------------------------------------------
// Driver script: fetch a list page, collect the detail-page links found in a
// marked region of it, then run the main-text extractor on every detail page
// and dump the intermediate data and the extracted text to disk.
// ---------------------------------------------------------------------------

// Declare the encoding of the page this script prints.
header("Content-type: text/html; charset=utf-8");
// Allow the crawl to run for up to 15 minutes.
set_time_limit( 60 * 15 );
// Folder for saved page content (not used by this script itself).
$sourceDir = 'pageFile/';
// Folder for the extracted text.
$textDir = 'pageText/';
// Create each output folder individually; creating all of them whenever any
// single one is missing makes mkdir() warn on the ones that already exist.
foreach( array( $textDir, 'mid', 'line_len', 'bl_size' ) as $dir )
{
    if( !is_dir( $dir ) )
    {
        mkdir( $dir );
    }
}
// Site prefix used to turn the relative links into absolute URLs.
$qianzui = 'http://www.thinkphp.cn';
// List page to crawl.
$url = 'http://www.thinkphp.cn/code/index.html';
$gather = new gather();
// Fetch the list page.
$html = $gather->geturlfile($url);
// Detect the source encoding and convert to UTF-8 when needed.
// mb_detect_encoding() returns false when nothing matches; in that case the
// content is left as it is instead of passing false on as an encoding name.
$encoding = mb_detect_encoding($html, array("ASCII", "UTF-8", "GB2312", "GBK", "BIG5"));
if( $encoding !== false && $encoding != "UTF-8" )
{
    $html = mb_convert_encoding($html, "UTF-8", $encoding);
}
// Markers delimiting the region of the list page that holds the links.
$start = '<div class="box extend">';
$end = '/教程列表 -->';
// Collect the URLs and titles of the links inside that region.
$code = $gather->get_sub_content($html, $start, $end);
$newsAry = $gather->get_all_url($code);
// Debug aid: $gather->vd($newsAry);
// Build the absolute detail-page URLs, skipping navigation/category links.
$skipWords = array( 'functions', 'system', 'examples', '#' );
$url_ary = array();
foreach( $newsAry['url'] as $v )
{
    $wanted = true;
    foreach( $skipWords as $word )
    {
        if( strpos( $v, $word ) !== false )
        {
            $wanted = false;
            break;
        }
    }
    if( $wanted )
    {
        $url_ary[] = $qianzui . $v;
    }
}
// Number of detail pages to process.
$fileNum = count($url_ary);
// Number of consecutive lines that form one block for the extractor.
$BL_BLOCK = 3;
// Extract the main text of every detail page.
// The bound is "<", not "<=": index $fileNum is one past the last URL.
for( $j = 0; $j < $fileNum; $j++ ) {
    // Download the page; skip it (keeping the numbering) when the fetch fails.
    $content = file_get_contents( $url_ary[$j] );
    if( $content === false )
    {
        echo 'fetch failed: ' . htmlspecialchars( $url_ary[$j] ) . '<br />';
        continue;
    }
    // Run the extractor.
    $iTextExtractor = new textExtract( $content, $BL_BLOCK );
    $text = $iTextExtractor->getPlainText();
    $lineNum = count( $iTextExtractor->textLines );
    echo $lineNum . '<br />';
    // Build the three per-page reports: the preprocessed lines, the byte
    // length of each line, and the length of each block. Every entry is
    // terminated by a real newline.
    $midDump = '';
    $lineLenDump = '';
    foreach( $iTextExtractor->textLines as $line )
    {
        $midDump .= $line . "\n";
        $lineLenDump .= strlen( $line ) . "\n";
    }
    $blSizeDump = '';
    foreach( $iTextExtractor->blksLen as $blkLen )
    {
        $blSizeDump .= (int) $blkLen . "\n";
    }
    // file_put_contents() truncates like fopen(..., "w") and returns false on
    // failure instead of handing an invalid handle to fprintf().
    $outputs = array(
        'mid/mid_' . $j . '.txt'           => $midDump,
        'line_len/line_len_' . $j . '.txt' => $lineLenDump,
        'bl_size/bl_size_' . $j . '.txt'   => $blSizeDump,
        $textDir . $j . '.html'            => $text . "\n",
    );
    foreach( $outputs as $fileName => $data )
    {
        if( file_put_contents( $fileName, $data ) === false )
        {
            echo 'write failed: ' . htmlspecialchars( $fileName ) . '<br />';
        }
    }
}
/**
 * Page content fetching helper.
 *
 * Downloads a URL, cuts a region out of an HTML string by start/end markers,
 * and lists the links (href + link text) found in a piece of HTML.
 *
 * @author Milkcy QQ:9877633
 * @copyright (C) 2012-2015 TCCMS.COM
 * @lastmodify 2012-07-10 14:00
 */
class gather {
    public $pagestring = '';
    private $db;

    /**
     * Picks up the global $db handle if one exists (null otherwise).
     * Nothing in this class reads it.
     */
    function __construct() {
        global $db;
        $this->db = $db;
    }

    /**
     * Download a URL and return its body with surrounding whitespace trimmed.
     *
     * Uses cURL (following redirects) when the extension is loaded and
     * file_get_contents() otherwise.
     *
     * @param string $url address to fetch
     * @return string page body, or '' when the download failed
     */
    function geturlfile($url) {
        $url = trim($url);
        if (extension_loaded('curl')) {
            $ch = curl_init();
            curl_setopt($ch, CURLOPT_URL, $url);
            curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
            curl_setopt($ch, CURLOPT_FOLLOWLOCATION, 1);
            curl_setopt($ch, CURLOPT_HEADER, 0);
            $content = curl_exec($ch);
            curl_close($ch);
        } else {
            $content = file_get_contents($url);
        }
        // Both transports return false on failure; report that as ''.
        return is_string($content) ? trim($content) : '';
    }

    /**
     * List every <a> link in a piece of HTML.
     *
     * The href value may be double-quoted, single-quoted or unquoted. The link
     * text must not contain '>', so anchors wrapping other tags are skipped.
     *
     * @param string $code HTML fragment
     * @return array array('name' => link texts, 'url' => href values), index-aligned
     */
    function get_all_url($code) {
        // '#' is the delimiter so that "</a>" needs no escaping.
        $pattern = '#<a.+?href=["\']?([^>"\' ]+)["\']?\s*[^>]*>([^>]+)</a>#is';
        if (!preg_match_all($pattern, (string) $code, $arr)) {
            return array('name' => array(), 'url' => array());
        }
        return array('name' => $arr[2], 'url' => $arr[1]);
    }

    /**
     * Return the text between the first $start marker and the next $end marker.
     *
     * @param string $str   text to search
     * @param string $start opening marker (trimmed before use)
     * @param string $end   closing marker (trimmed before use)
     * @return string $str unchanged when either marker is empty; '' when
     *                $start does not occur; otherwise the text after $start up
     *                to $end (or up to the next $start / end of text when $end
     *                is not found)
     */
    function get_sub_content($str, $start, $end) {
        $start = trim($start);
        $end = trim($end);
        if ($start == '' || $end == '') {
            return $str;
        }
        $parts = explode($start, (string) $str);
        // Only one piece means the start marker never occurred.
        if (!isset($parts[1])) {
            return '';
        }
        $parts = explode($end, $parts[1]);
        return $parts[0];
    }

    /**
     * Debug helper: var_dump() a value inside a styled <div><pre> wrapper.
     *
     * @param mixed $var value to dump
     * @return void
     */
    function vd($var) {
        echo '<div style="border:1px solid #ddd;background:#F7F7F7;padding:5px 10px;">' . "\r\n";
        echo '<pre style="font-family:Arial,Vrinda;font-size:14px;">' . "\r\n";
        var_dump($var);
        echo "\r\n</pre>\r\n";
        echo "</div>";
    }
}
// Main-text extraction class
# Script - class.textExtract.php
/**
 * textExtract - text extraction class
 *
 * Strips markup from a web page, splits the remainder into lines, sums the
 * line lengths over a sliding window of $blkSize lines ("blocks"), and returns
 * the run of consecutive non-empty blocks with the largest total length as the
 * page's main text.
 *
 * Created on 2010-08-10
 * author: Wenfeng Xuan
 * Email: wfxuan@insun.hit.edu.cn
 * Blog: http://hi.baidu.com/xwf_like
 */
class textExtract {
    ///////////////////////////////////
    // MEMBERS
    ///////////////////////////////////
    /**
     * record the web page's source code
     * @var string
     */
    public $rawPageCode = '';
    /**
     * record the text after preprocessing, one whitespace-free entry per line
     * @var array
     */
    public $textLines = array();
    /**
     * record the length of each block; entry i covers lines i .. i+blkSize-1
     * @var array
     */
    public $blksLen = array();
    /**
     * record the final extracted text
     * @var string
     */
    public $text = '';
    /**
     * set the size of each block ( regards how many single lines as a block )
     * it is the only parameter of this method
     * @var int
     */
    public $blkSize;

    ///////////////////////////////////
    // METHODS
    ///////////////////////////////////
    /**
     * Set the value of relevant members
     * @param string $_rawPageCode
     * @param int $_blkSize
     * @return void
     */
    function __construct( $_rawPageCode, $_blkSize = 3 ) {
        $this->rawPageCode = $_rawPageCode;
        $this->blkSize = $_blkSize;
    }

    /**
     * Preprocess the web page's source code: remove the DOCTYPE, comments,
     * scripts, style sheets, all remaining tags and HTML entities.
     * @return string the page reduced to its text, line breaks preserved
     */
    function preProcess() {
        $content = (string) $this->rawPageCode;
        // Applied in this order; '/' inside a pattern is escaped because it is
        // also the delimiter.
        $patterns = array(
            '/<!DOCTYPE.*?>/si',             // 1. DTD information
            '/<!--.*?-->/s',                 // 2. HTML comment
            '/<script.*?>.*?<\/script>/si',  // 3. Java Script
            '/<style.*?>.*?<\/style>/si',    // 4. CSS
            '/<.*?>/s',                      // 5. HTML TAGs
            '/&.{1,5};|&#.{1,5};/',          // 6. some special characters
        );
        foreach( $patterns as $pattern ) {
            $stripped = preg_replace( $pattern, '', $content );
            // preg_replace() returns null on a PCRE error; keep the previous
            // text rather than losing the whole page.
            if( $stripped !== null ) {
                $content = $stripped;
            }
        }
        return $content;
    }

    /**
     * Split the preprocessed text into lines by "\n"
     * after replacing "\r\n" and "\r" with "\n", and append each line, with
     * all whitespace removed, to $textLines.
     * @param string $rawText
     * @return void
     */
    function getTextLines( $rawText ) {
        // normalise the line endings
        $rawText = str_replace( array( "\r\n", "\r" ), "\n", (string) $rawText );
        foreach( explode( "\n", $rawText ) as $line ) {
            // remove the blanks in each line
            $this->textLines[] = preg_replace( '/\s+/', '', $line );
        }
    }

    /**
     * Calculate the blocks' length and store them in $blksLen
     * (the previous content of $blksLen is discarded).
     * @return void
     */
    function calBlocksLen() {
        $this->blksLen = array();
        $textLineNum = count( $this->textLines );
        if( $textLineNum === 0 ) {
            return;
        }
        $size = max( 1, (int) $this->blkSize );
        // calculate the first block's length; with fewer lines than the block
        // size the single block simply covers every line
        $blkLen = 0;
        $firstEnd = min( $size, $textLineNum );
        for( $i = 0; $i < $firstEnd; $i++ ) {
            $blkLen += strlen( $this->textLines[$i] );
        }
        $this->blksLen[] = $blkLen;
        // slide the window one line at a time: add the line entering it and
        // subtract the line leaving it. "<=" so that the last full window,
        // starting at line (count - size), is included.
        for( $i = 1; $i <= ($textLineNum - $size); $i++ ) {
            $blkLen += strlen( $this->textLines[$i - 1 + $size] ) - strlen( $this->textLines[$i - 1] );
            $this->blksLen[] = $blkLen;
        }
    }

    /**
     * Extract the text from the web page's source code
     * according to the simple idea:
     * [the text should be the longest continuous content
     * in the web page]
     *
     * Also prints the 1-based first and last block index of the chosen run,
     * each followed by '<br />'.
     * @return string the selected lines concatenated without separators
     */
    function getPlainText() {
        // start from a clean state so a repeated call does not pile up on the
        // previous call's lines
        $this->textLines = array();
        $this->text = '';
        $preProcText = $this->preProcess();
        $this->getTextLines( $preProcText );
        $this->calBlocksLen();
        $start = $end = -1;
        $i = $maxTextLen = 0;
        $blkNum = count( $this->blksLen );
        while( $i < $blkNum ) {
            // skip the empty blocks separating two runs
            while( ($i < $blkNum) && ($this->blksLen[$i] == 0) ) $i++;
            if( $i >= $blkNum ) break;
            $tmp = $i;
            $curTextLen = 0;
            $portion = '';
            // consume one run of non-empty blocks
            while( ($i < $blkNum) && ($this->blksLen[$i] != 0) ) {
                $portion .= $this->textLines[$i];
                $curTextLen += $this->blksLen[$i];
                $i++;
            }
            // keep the run with the largest summed block length
            if( $curTextLen > $maxTextLen ) {
                $this->text = $portion;
                $maxTextLen = $curTextLen;
                $start = $tmp;
                $end = $i - 1;
            }
        }
        echo ($start + 1) . '<br />' . ($end + 1) . '<br />';
        return $this->text;
    }
}
?>