天涯用户名采集

0

Posted by admin | Posted in php, 产品 | Posted on 23-07-2010

两年前的一个采集程序。

<?php
/*
*Desc:extract author name from http://www.tianya.cn/
*Author:digmouse
*QQ:283685878
*date:2008-8-15 in shanghai
*/
// Report every notice/warning: the scraper relies on them to surface
// unexpected page layouts.
error_reporting(E_ALL);

// First listing page to scrape; get_ty() follows the "next page" links from
// here. The literal was wrapped in typographic quotes (not valid PHP string
// delimiters); plain single quotes keep the URL byte-identical.
$url = 'http://www.tianya.cn/new/publicForum/ArticlesList.asp?PageNo=1&strItem=fans&Part=0&NextArticle=2008-12-4+8%3A21%3A40&strSubItem=&strSubItem2=';

// NOTE(review): get_ty() issues mysql_query() calls. With conn_db() disabled,
// ext/mysql falls back to an implicit connection using the php.ini defaults.
// Re-enable the call below if the credentials in conn_db() are the intended ones.
//conn_db();//connect db

get_ty($url);//extract data
/**
 * Scrape author names from a listing page and from every following page,
 * inserting each name into the `name` table.
 *
 * For each page the function downloads $url, echoes the escaped HTML (debug
 * output kept from the original), extracts the text of every
 * `<td width=90>...</td>` cell, strips tags from it and inserts it with
 * mysql_query(). It then asks get_nextpage() for the next URL and continues
 * until get_nextpage() returns false.
 *
 * The script terminates (exit) when a page cannot be downloaded or when an
 * INSERT fails.
 *
 * NOTE(review): this still uses the legacy ext/mysql API (removed in PHP 7) so
 * that it keeps working with the implicit connection the rest of the file
 * relies on.
 *
 * @param string $url URL of the first listing page to scrape.
 * @return void
 */
function get_ty($url)
{
    $step_3 = '<td width=90>';
    $step_4 = '</td>';
    $len_3 = mb_strlen($step_3);

    // Walk the pages iteratively. The original recursed once per page, which
    // kept every previous page's HTML alive on the call stack.
    while (true) {
        $html = file_get_contents($url);
        if ($html === false) {
            echo 'fetch failed '.$url;
            exit;
        }
        // Untouched copy of the page: the pager is searched in the full
        // document, not in the progressively truncated $html.
        $html_1 = $html;
        echo htmlspecialchars($html);

        // NOTE(review): the start-of-list marker is empty in the published
        // source (presumably an HTML snippet that the blog engine swallowed).
        // An empty needle makes mb_strpos() fail on PHP < 8, so the trimming
        // step is only applied once a real marker is filled in.
        $step_1 = '';
        if ($step_1 !== '') {
            $start_1 = mb_strpos($html, $step_1);
            if ($start_1 === false) {
                echo 'not find string '.$step_1.' in step_1';
                exit;
            }
            $html = mb_substr($html, $start_1 + mb_strlen($step_1));
        }

        // Compare against false explicitly: a cell opening at offset 0 is a
        // valid match and must not end the loop.
        while (($start_3 = mb_strpos($html, $step_3)) !== false) {
            $html = mb_substr($html, $start_3 + $len_3);

            $end_3 = mb_strpos($html, $step_4);
            if ($end_3 === false) {
                // Unterminated cell: nothing reliable left to extract.
                break;
            }

            $name = strip_tags(mb_substr($html, 0, $end_3));

            // The name comes from a remote page; escape it before it is
            // embedded in the statement.
            $sql = "insert name(name) values('".mysql_real_escape_string($name)."');";
            echo $sql;
            $res = mysql_query($sql);
            if ($res === false) {
                echo 'query sql failed '.mysql_error();
                exit;
            }
        }

        $nexturl = get_nextpage($html_1);
        if ($nexturl == false) {
            echo '-_-  ok';
            return;
        }
        $url = $nexturl;
    }
}
/**
 * Extract the URL of the "next page" link from a listing page.
 *
 * Strategy: locate the link text ($marker), look backwards from it for the
 * closest preceding `</A>`, then take the href value of the first
 * `<a href=` that follows that `</A>`.
 *
 * The original never assigned $start_6, so the `<a href=` search did not run
 * and the URL was cut out with fixed offsets (8 + 6 characters). That only
 * worked when exactly one character separated `</A>` from `<a href=/`. The
 * href is now located explicitly; surrounding quotes and a leading slash are
 * removed before the site prefix is added.
 *
 * NOTE(review): the default marker '??' looks like mis-encoded text
 * (presumably the localized "next page" label) — confirm against the live
 * page and pass the real label as $marker.
 *
 * @param string $html   Full HTML of the listing page.
 * @param string $marker Text of the "next page" link; defaults to the value
 *                       hard-coded in the original.
 * @return string|false Absolute URL of the next page, or false when any of
 *                      the expected markers is missing (a diagnostic is
 *                      echoed in that case).
 */
function get_nextpage($html, $marker = '??')
{
    // An empty needle is an error for mb_strpos() before PHP 8; treat it as
    // "not found" on every version.
    $start_4 = ($marker === '') ? false : mb_strpos($html, $marker);
    if ($start_4 === false) {
        echo 'not find string '.$marker.' in step_4';
        return false;
    }
    // Keep everything up to and including the link text.
    $html = mb_substr($html, 0, $start_4 + mb_strlen($marker));

    // The last closing anchor before the link text delimits the start of the
    // "next page" anchor.
    $step_5 = '</A>';
    $start_5 = mb_strrpos($html, $step_5);
    if ($start_5 === false) {
        echo 'not find string '.$step_5.' in step_5';
        return false;
    }
    $html = mb_substr($html, $start_5);

    $step_6 = '<a href=';
    $start_6 = mb_strpos($html, $step_6);
    if ($start_6 === false) {
        echo 'not find string '.$step_6.' in step_6';
        return false;
    }
    $html = mb_substr($html, $start_6 + mb_strlen($step_6));

    $step_7 = '>';
    $end_7 = mb_strpos($html, $step_7);
    if ($end_7 === false) {
        echo 'not find string '.$step_7.' in step_7';
        return false;
    }

    // Drop whitespace and optional quotes around the attribute value.
    $nexturl = trim(mb_substr($html, 0, $end_7), " \t\n\r\0\x0B\"'");
    if ($nexturl === '') {
        return false;
    }
    // Already absolute: return as-is instead of prefixing the site root.
    if (preg_match('#^https?://#i', $nexturl)) {
        return $nexturl;
    }

    return 'http://www.tianya.cn/'.ltrim($nexturl, '/');
}
/**
 * Open the MySQL connection used by the scraper and select its database.
 *
 * Terminates the script with a short message if either the connection or the
 * database selection fails.
 *
 * NOTE(review): credentials are hard-coded in source; they should be moved to
 * configuration outside the published file. ext/mysql is kept because the
 * rest of the file calls mysql_query() on the implicit connection.
 *
 * @return resource The link identifier returned by mysql_connect().
 */
function conn_db()
{
    $ip = 'localhost';
    $user = 'root';
    $pwd = 'chenguokui';
    $db = 'test';

    $conn = mysql_connect($ip, $user, $pwd) or die('connect db failed');
    // Select the database on the link just opened rather than on whichever
    // link happens to be the most recent one.
    mysql_select_db($db, $conn) or die('conn db failed');

    return $conn;
}
?>

Comments are closed.