Saturday, June 29, 2013

Crawling a Website with VIEWSTATE & EVENTVALIDATION using PHP

It was a really tough job when I tried to make automated requests to a site that uses __VIEWSTATE and __EVENTVALIDATION, which I believe means it is built on ASP.NET. To request any page, we need to send the __VIEWSTATE and __EVENTVALIDATION hidden values from the previous response along with the next request. These values change with every page you fetch.

In short, although HTTP itself is a stateless protocol, these two variables tell the server-side script which state the client was previously in, so that it can serve the data for the current state.

You should also send all the other hidden POST variables; otherwise the request could fail.

There is a useful function, 'exfield', which extracts the value of any named hidden field, and a 'sendpost' function, which sends all the arguments in the passed array to the URL using the POST method and returns the result.

I hope it is of some help to you!

<?php
// Crawl driver: for every search code, run the search, then walk each result
// page and download the detail page behind every "Select$N" link, caching each
// detail page on disk so that an interrupted run can be resumed.
define('URL', 'http://www.example.com');  //The website using .Net ASP
define('PAT_RESULTS_FOUND', '/Search result:([0-9]+) Results found/');
define('TOTAL_officeofficeS_IN_PAGE', 10); //max Select01 like links in a page
//................................................................
$total_internet_requests = 0;
assert_options(ASSERT_CALLBACK, 'my_assert_handler');

$pppcodearr = array('1', '2');
// Hidden state fields that have to be copied from the last response into
// the next request.
$exflds = array('__VIEWSTATE', '__EVENTVALIDATION');

foreach ($pppcodearr as $pppcode) {
    $not_found_file = $pppcode . "not-found";

    if (file_exists($not_found_file)) {
        continue; //skip this ppp-code
    }

    $content = sendget(); //the very first page
    $fields = array(
        'ddl_dist' => '341',
        'ddl_state' => '1',
        'hdn_tabchoice' => '1',
        'search_on' => 'Search',
        'txt_dist_on' => '',
        'txt_offname' => $pppcode,
        //'__EVENTARGUMENT' => 'Page$3',
        '__EVENTTARGET' => 'ggg',
        'txt_stateon' => '',
    );
    foreach ($exflds as $val) {
        $fields[$val] = exfield($val, $content);
    }
    $fields['__VIEWSTATEENCRYPTED'] = '';
    // This is Page$1. Its validity is checked at the top of the page loop
    // below, before any link on it is followed.
    $content = sendpost($fields);

    $total_recs = get_total_results_found($content);
    if ($total_recs === null) {
        // The count could not be determined; do not mistake that for
        // "no records" and write a permanent not-found marker.
        print "Unable to determine result count for $pppcode\n";
        continue;
    }
    if ($total_recs == 0) {
        //indicate no records for this pppcode
        // The marker is an empty file, so a successful write returns 0 bytes:
        // compare against false instead of testing truthiness, and keep the
        // write itself outside assert() so it still happens when assertions
        // are disabled.
        $written = file_put_contents($not_found_file, '');
        assert($written !== false);
        continue;
    }
    $total_recs = (int) $total_recs;
    $total_pages = (int) ceil($total_recs / TOTAL_officeofficeS_IN_PAGE);

    // Check whether every record of this code has already been downloaded.
    // Record number $rec lives on page floor($rec / page size) + 1 at link
    // index $rec % page size.
    $not_exists = false;
    for ($rec = 0; $rec < $total_recs; ++$rec) {
        $pg = (int) floor($rec / TOTAL_officeofficeS_IN_PAGE) + 1;
        $sel = $rec % TOTAL_officeofficeS_IN_PAGE;
        $file = coin_ppprecord_filename($pppcode, $pg, $sel);
        if (file_exists($file) && checkvalidpage(file_get_contents($file))) {
            //skip it
            if (dbg()) {
                print "$file already exists ... skipping\n";
            }
        } else {
            $not_exists = true;
            break;
        }
    }
    if (!$not_exists) {
        continue; //everything is cached, nothing to fetch for this code
    }

    //if at least 1 record does not exist then only enter this loop.
    for ($page_no = 1; $page_no <= $total_pages; ++$page_no) {
        //$content holds Page$<page_no> here
        wrt($content);
        if (!checkvalidpage($content)) {
            break;
        }

        $fields = array(
            'ddl_dist' => '0',
            'ddl_state' => '1',
            'hdn_tabchoice' => '1',
            'txt_dist_on' => '',
            'txt_offname' => $pppcode,
            '__EVENTARGUMENT' => 'Select$0',
            '__EVENTTARGET' => 'ggg',
            'txt_stateon' => '',
            '__VIEWSTATEENCRYPTED' => '',
        );
        foreach ($exflds as $val) {
            $fields[$val] = exfield($val, $content);
        }

        // Count the links on *this* page. The count is recomputed for every
        // page; a single counter decremented across pages would go negative
        // after the first page and never terminate the loop.
        $post_offices_in_page = (int) officeofficesinpage($content);
        for ($sel = 0; $sel < $post_offices_in_page; ++$sel) {
            $fields['__EVENTARGUMENT'] = getselno($sel);
            $file = coin_ppprecord_filename($pppcode, $page_no, $sel);
            if (file_exists($file) && checkvalidpage(file_get_contents($file))) {
                //skip it
                if (dbg()) {
                    print "$file already exists ... skipping\n";
                }
                continue;
            }

            $result = sendpost($fields);
            if (checkvalidpage($result)) {
                $written = file_put_contents($file, $result);
                assert($written !== false);
            } else {
                // Report and carry on with the remaining links.
                print "Is not valid page found for $file\n";
                print " $sel < $post_offices_in_page $page_no\n";
            }
        }

        if ($page_no >= $total_pages) {
            break; //last page done: do not request a page that does not exist
        }

        //go over to the next page
        $fields = array(
            'ddl_dist' => '0',
            'ddl_state' => '1',
            'hdn_tabchoice' => '1',
            'txt_dist_on' => '',
            'txt_offname' => $pppcode,
            '__EVENTARGUMENT' => getpageno($page_no + 1),
            '__EVENTTARGET' => 'ggg',
            'txt_stateon' => '',
            '__VIEWSTATEENCRYPTED' => '',
        );
        foreach ($exflds as $val) {
            $fields[$val] = exfield($val, $content);
        }
        $content = sendpost($fields);
    }
}//for each
print "Total internet page requests = $total_internet_requests\n";

/**
 * Debug verbosity level used throughout the script.
 *
 * @return int Callers test it for truthiness or compare it with `>= 2`.
 */
function dbg() {
    return 1;
}

/**
 * Assertion callback: reports the failed assertion with a backtrace and
 * terminates the script with exit status 1.
 *
 * @param string $file File in which the assertion failed.
 * @param int    $line Line of the failed assertion.
 * @param mixed  $code Asserted expression as passed by the assert machinery.
 */
function my_assert_handler($file, $line, $code) {
    echo "<hr>Assertion Failed:\nFile '$file'<br />\nLine '$line'<br />\nCode '$code'<br /><hr />";
    var_dump(debug_backtrace());
    exit(1);
}

/**
 * Extracts the number of search results reported on a result page.
 *
 * @param string $content HTML of the result page.
 * @return int|string 0 when the page contains the "no match" message,
 *                    otherwise the digits captured by PAT_RESULTS_FOUND.
 * @throws RuntimeException When the page contains neither the "no match"
 *                          message nor a result count. Returning nothing here
 *                          would be read by callers as "0 results".
 */
function get_total_results_found($content) {
    $content = (string) $content;

    if (strpos($content, 'No Matched Post offices found') !== false) {
        return 0;
    }

    if (preg_match(PAT_RESULTS_FOUND, $content, $matches)) {
        if (dbg()) {
            print "total pppcode results=$matches[1]\n";
        }
        return $matches[1];
    }

    //can't reach here with a well-formed result page
    throw new RuntimeException('Result page contains neither a result count nor the no-match message');
}
//count number of officeoffice link in the page
/**
 * Counts the "Select$N" postback links contained in a result page.
 *
 * The statements of this function used to sit on the same line as a `//`
 * comment and therefore never ran, so the function always returned null.
 * The counting logic is restored here; the debug dump of the page that
 * shared that line is not, so the function has no side effects.
 *
 * @param string $content HTML of a result page.
 * @return int Number of links found (0 when there are none).
 */
function officeofficesinpage($content) {
    //They look like javascript:__doPostBack(&#39;ggg&#39;,&#39;Select$[0-9]{1,2}
    $pat = '/javascript:__doPostBack\(&#39;ggg&#39;,&#39;Select\$[0-9]{1,2}/';
    $ret = preg_match_all($pat, (string) $content, $matches);
    assert($ret !== false);
    return (int) $ret;
}

/**
 * Builds the postback argument that requests result page $page.
 *
 * @param int|string $page Page number.
 * @return string 'Page$' followed by the page number.
 */
function getpageno($page) {
    return 'Page$' . $page;
}

/**
 * Builds the postback argument that selects link $sel on the current page.
 *
 * @param int|string $sel Link index.
 * @return string 'Select$' followed by the link index.
 */
function getselno($sel) {
    return 'Select$' . $sel;
}

/**
 * Decides whether a downloaded page is usable.
 *
 * A page is rejected when it is shorter than 65000 bytes or when it contains
 * the site's "serious problem" error message.
 *
 * @param string|null|false $content Page body; null/false count as empty.
 * @return bool true when the page is usable.
 */
function checkvalidpage($content) {
    $content = (string) $content;
    if (strlen($content) < 65000) {
        return false;
    }
    $error_marker = 'Sorry this site has encountered a serious problem, please try reloading the page';
    return strpos($content, $error_marker) === false;
}
//extract value of a hidden field
/**
 * Extracts the value of a hidden input field from a page.
 *
 * @param string $field   Name attribute of the hidden field.
 * @param string $content HTML to search.
 * @return string|null The field's value (possibly the empty string), or null
 *                     when the field is not present; a message is printed in
 *                     that case.
 */
function exfield($field, $content) {
    // The field name is quoted so it is matched literally. [^>]*? keeps the
    // search inside the same tag, and ([^"]*) also accepts value="" instead
    // of skipping ahead to the value of a later field.
    $pat = '~<input\s+type="hidden"\s+name="' . preg_quote($field, '~') . '"[^>]*?value="([^"]*)"~';
    if (preg_match($pat, (string) $content, $match)) {
        return $match[1];
    }
    print("Unable to extract $field\n");
    return null;
}

/**
 * Dumps a page to a.htm in the temp directory for inspection.
 *
 * @param string $content Page body to write.
 */
function wrt($content) {
    file_put_contents(get_temp_dir() . "a.htm", $content);
}

/**
 * Fetches URL with a GET request and counts the request.
 *
 * The response is also written to abc.htm in the temp directory.
 *
 * @return string Response body; the empty string when the transfer failed.
 */
function sendget() {
    global $total_internet_requests;

    $ch = curl_init(URL);
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, 0);
    $txResult = curl_exec($ch);
    $statuscode = curl_getinfo($ch, CURLINFO_HTTP_CODE);
    ++$total_internet_requests;

    if ($txResult === false) {
        // Report the failure and hand back an empty body; callers treat a
        // short page as invalid.
        print "GET request failed: " . curl_error($ch) . "\n";
        $txResult = '';
    }
    curl_close($ch);

    if (dbg() >= 2) {
        print "statuscode=$statuscode\n";
        print "Result=$txResult\n";
    }

    // Write outside assert() so the dump still happens when assertions are
    // disabled.
    $written = file_put_contents(get_temp_dir() . "abc.htm", $txResult);
    assert($written !== false);
    return $txResult;
}

/**
 * Sends the given fields to URL with a POST request and counts the request.
 *
 * @param array $postarr Field name => value pairs; null values are sent as
 *                       empty strings.
 * @return string Response body; the empty string when the transfer failed.
 */
function sendpost($postarr) {
    global $total_internet_requests;

    $pairs = array();
    foreach ($postarr as $key => $val) {
        $pairs[] = urlencode((string) $key) . '=' . urlencode((string) $val);
    }
    $data = implode('&', $pairs);

    $custom_headers = array(
        "Accept: text/html, application/xhtml+xml, application/xml;q=0.9, */*;q=0.8",
        "Pragma: no-cache",
        "Cache-Control: no-cache",
        "Accept-Language: en-us;q=0.7,en;q=0.3",
        "Accept-Charset: utf-8,windows-1251;q=0.7,*;q=0.7",
    );
    $useragent = "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:9.0.1) Gecko/20100101 Firefox/9.0.1";

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_USERAGENT, $useragent); // set user agent
    curl_setopt($ch, CURLOPT_URL, URL);
    if (strlen($data)) {
        curl_setopt($ch, CURLOPT_POSTFIELDS, $data);
        curl_setopt($ch, CURLOPT_POST, 1);
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_HEADER, false);
    curl_setopt($ch, CURLOPT_HTTPHEADER, $custom_headers);
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 20);
    curl_setopt($ch, CURLOPT_TIMEOUT, 40); //timeout in seconds

    // This call must be on its own line: when it trailed the comment above,
    // it was part of the comment and no request was ever sent.
    $txResult = curl_exec($ch);
    ++$total_internet_requests;
    $statuscode = curl_getinfo($ch, CURLINFO_HTTP_CODE);

    if ($txResult === false) {
        print "POST request failed: " . curl_error($ch) . "\n";
        $txResult = '';
    }
    curl_close($ch);

    if (dbg() >= 2) {
        print "statuscode=$statuscode\n";
        print "Result=$txResult\n";
    }
    if (dbg()) {
        // Keep a copy of every response in a uniquely named file.
        $dump_file = tempnam(get_temp_dir() . "pppcode", "post_req");
        $written = file_put_contents($dump_file, $txResult);
        assert($written !== false);
    }
    return $txResult;
}

/**
 * Directory (with trailing slash) used for dumps and cached records.
 *
 * @return string
 */
function get_temp_dir() {
    return "f:/tmp/";
}

/**
 * Builds the cache file name for one downloaded record.
 *
 * @param string     $pppcode Search code the record belongs to.
 * @param int|string $page_no Result page the record was listed on.
 * @param int|string $sel     Index of the record's link on that page.
 * @return string Path of the form <temp>/pppcode/<code>-Page$<n>-Select$<m>.htm
 */
function coin_ppprecord_filename($pppcode, $page_no, $sel) {
    return get_temp_dir() . "pppcode/" . "$pppcode-" . getpageno($page_no) . '-' . getselno($sel) . ".htm";
}