Помогите с парсером

Сбор и анализ всего что можно собрать из сети.
Ответить
kaputto
Сообщения: 1
Зарегистрирован: 01 дек 2011, 13:52

Помогите с парсером

Сообщение kaputto » 01 дек 2011, 14:06

Вот код 1:
<?php
// The following code is required to properly run XWeb Human Emulator
require("C:\XWeb\Human Emulator\Templates/xweb_human_emulator.php");

$slmin=10; // ??????????? ????????
$slmax=15; // ???????????? ????????
$slban=300; // ???????? ????? ????

$key = ""; // ???? ?????????
$tmpimg = "C:\XWeb\parser-go\captcha.jpeg"; // ?????, ???? ????? ??????????? ?????

$qwery = file("key.txt");

echo "<h2>???????? ??????? ".count($qwery)." ????????</h2>";

for ($i = 0; $i < (count($qwery)); $i++) {

$browser->clear_cash();
$browser->clear_cookies("");
$browser->recreate();

$browser->enable_images(1,true);
$browser->enable_java_script(1);
$browser->enable_activex(1);
$browser->enable_video(0);
$browser->enable_sounds(0);
$browser->enable_java(1);
$browser->enable_frames(1);
$browser->enable_popup(0);
$browser->enable_quiet_regime(1);
$browser->disable_script_error(1);
$browser->enable_cache(1);
$browser->enable_browser_message_boxes("false");

$uagents=file("user-agent.txt");
$browser->set_user_agent($uagents[rand(0,count($uagents)-1)]);

echo "<small><b>?????? ".($i+1)." ?????? <u>".$qwery[$i]."</u></b></small><br />";

if ($i!==0) {
$realsl = mt_rand($slmin, $slmax);
echo "<small><i>???????? ".$realsl." ??????...</i></small><br />";
sleep ($realsl);
}

for ($p = 0; $p < 101; $p++) {

if ($p!==0) {
$realsl = mt_rand($slmin, $slmax);
echo "<small><i>???????? ".$realsl." ??????...</i></small><br />";
sleep ($realsl);
}

if ($p==0) $myurl = "http://www.google.ru/search?as_q=".urle ... 0&filter=0";
else $myurl = "http://www.google.ru/search?as_q=".urle ... "&filter=0";

if ($p==0) $myurl = "http://www.google.ru/search?num=100&hl= ... ($qwery[$i]);
else $myurl = "http://www.google.ru/search?q=".urlenco ... "&filter=0";

if ($p==0) $myurl = "http://www.google.com/search?hl=com&new ... ($qwery[$i]);
else $myurl = "http://www.google.com/search?q=".urlenc ... "&filter=0";

$browser->navigate($myurl);
$browser->wait_for(60,1);

sleep (100);

/* ???? ????? */

for ($? = 0; $? < 3; $?++) {

if ($input->is_exist_with_name("captcha")===true) {
$image->save_to_file_by_number(0,$tmpimg);
echo "<small><i>?????... ??????????</i></small><br>";
$captcha = $image->recognize_by_anticaptcha("","captcha.jpeg",$key,"http://antigate.com");
echo "<br />";
$input->set_value_by_name("captcha",$captcha);
$button->click_by_name('submit');
$browser->wait_for(60,1);
sleep(1);
}
else break;

}

if (stristr($webpage->get_source(), iconv("Windows-1251", "UTF-8", "<title>???????? ???? ?????????"))) {
echo "<small><font color=\"#FF0000\">????????? ?? ".($p+1)." ????????</font></small><br />";
echo "<small><i>???????? ".$slban." ??????...</i></small><br />";
$p--;
continue;
}

$fp=fopen("result.txt","a+");
fputs($fp,$anchor->get_all_urls());
fclose($fp);

echo "<small><u>???????? ???????? ".($p+1)."</u></small><br />";

if (!stristr($webpage->get_source(), iconv("Windows-1251", "UTF-8", "?????????</span></a></table></div>")) AND !stristr($webpage->get_source(), iconv("Windows-1251", "UTF-8", "?????????</a></table></div>"))) {
echo "<small><b>??????? ?????? ???</b></small><br />";
break;
}

}

echo "<br />";

}



$content = file_get_contents("result.txt");
$content = explode("<br>", $content);

for ($i = 0; $i < (count($content)); $i++) {
if (stristr($content[$i], "yandex") OR stristr($content[$i], "google.com") OR stristr($content[$i], "google.ru") OR stristr($content[$i], "googleusercontent.com") OR stristr($content[$i], "youtube.com")) {}
else {
$goodurl[] = $content[$i];
}
}

$goodurl = array_unique($goodurl);
$goodurl = implode("\r\n", $goodurl);

$fp=fopen("result.txt","w+");
fputs($fp,$goodurl);
fclose($fp);

echo "<h2>?????????!</h2>";

// Quit
$app->quit();
?>
и код 2:
<?php

$xhe_host ="127.0.0.1:7011";

// The following code is required to properly run XWeb Human Emulator
require("C:\XWeb\Human Emulator\Templates/xweb_human_emulator.php");

function randline($file){
$lines = file($file);
$element = array_shift($lines);
file_put_contents ( $file, $lines);
return $element;
}
$ac='keynah'; //???? ?? ?????????
$min=3; //???. ???????? ??? ????????
$max=4;//????. ???????? ??? ????????
$keys='C:\key.txt';//???? ? ???????
$links='C:/links.txt';//???? ??? ?????? ????????? ??????
$dopkeys='';//???????? ????????? ?? ???? ??????, ????? ???????? ??????
for ($j=0;$j<50000;$j++){

$key=randline($keys);
if ($key=="") break;
trim($key);
$key=$key.$dopkeys;
str_replace(" ", "+",$key);
$browser->navigate("http://www.google.com/search?q=$key&num ... art=0&sa=N");
$browser->wait_for(60,1);
$a=$webpage->get_source();

if(strpos($a, 'To continue, please type the characters below:')==true){
$image->save_to_file_by_number(0,'C:\tmp.jpeg');
$browser->wait_for(60,1);
$captcha= $image->recognize_by_anticaptcha("", "C:/tmp.jpeg",$ac,"http://antigate.com/");
$browser->wait_for(60,1);
$input->set_value_by_name('captcha',$captcha);
$button->click_by_name('submit');
}

preg_match_all("/(?<=h3 class\=\"r\"\>\<a href\=\").*?(?=\" class\=l)/", $a, $mat);
$mat1=implode("\r\n", $mat[0]);
$file=fopen($links,'a');
fwrite($file, $mat1);
fclose($file);
sleep(rand($min,$max));

for($w=0;$w<100;$w++){
if($anchor->click_by_inner_text('Next ',true)==true){
$browser->wait_for(60,1);
$a=$webpage->get_source();

if(strpos($a, 'To continue, please type the characters below:')==true){
$image->save_to_file_by_number(0,'C:\tmp.jpeg');
$browser->wait_for(60,1);
$captcha= $image->recognize_by_anticaptcha("", "C:/tmp.jpeg",$ac,"http://antigate.com/");
$browser->wait_for(60,1);
$input->set_value_by_name('captcha',$captcha);
$button->click_by_name('submit');
}

preg_match_all("/(?<=h3 class\=\"r\"\>\<a href\=\").*?(?=\" class\=l)/", $a, $mat);
$mat1=implode("\r\n", $mat[0]);
$file=fopen($links,'a');
fwrite($file, $mat1);
fclose($file);
sleep(rand($min,$max));
}
else break;
}
}
// Quit
$app->quit();
?>
Первый парсит только первую страницу, второй запускается и за секунду останавливается. Помогите разобратся, я пока нуб в етих вопросах.

Ответить