<?php
/*
Copyright (C) 2005 Sam J. Clarke
All rights reserved.
+-------------------------------------------+
| |
| PHP Robot Class |
| |
+-------------------------------------------+
| |
| Author Name: Sam J. Clarke |
| Author Email: admin@free-php.org.uk |
| Author URI: http://www.free-php.org.uk/ |
| Description: This script is a robot class |
| to help you build web robots. |
| |
+-------------------------------------------+
| |
| If you like this, Please link back to us. |
| |
+-------------------------------------------+
LICENSE
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License (GPL)
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
To read the license please visit http://www.gnu.org/copyleft/gpl.html
*/
class Robot {
var $Agent = '-'; // user agent
var $temp_everything = false; // stores what's sent back
// Gets the status returned by the server.
// Returns false on failure and the status (e.g. "200 OK") on success.
// You must call GetEverything() first.
function GetStatus()
{
    // Extracts the HTTP status from the raw response stored in
    // $this->temp_everything.
    // Returns the status line with its leading "HTTP/x.y " removed
    // (for example "200 OK"), or false when nothing is stored or the
    // headers contain no status line.
    $raw = $this->temp_everything;
    if (!$raw) {
        return false;
    }

    // Headers end at the first blank line; only that part is inspected.
    $pieces = preg_split("/(\r\n\r\n|\r\r|\n\n)/", $raw, 2);
    $headers = preg_split("/(\r\n|\n|\r)/", $pieces[0]);

    foreach ($headers as $line) {
        // Anchored at the start of the line so that a later header which
        // merely mentions "HTTP/1.1" (e.g. Via:) cannot be mistaken for
        // the status line; the first status line found wins.
        if (preg_match("/^HTTP\/[0-9A-Za-z +]/i", $line)) {
            return preg_replace("/http\/[0-9]\.[0-9] /i", '', $line);
        }
    }

    // No status line at all: report failure instead of an undefined value.
    return false;
}
// Fetches everything the server sends back for the URL and stores it in a string.
// Returns false on failure and true on success.
function GetEverything($url)
{
    // Fetches $url with a plain HTTP/1.0 GET over a raw socket and stores
    // the complete response (headers and body) in $this->temp_everything.
    // $url must contain a host; an explicit port is honoured, otherwise
    // port 80 is used. The URL scheme is not inspected, so the request is
    // always sent unencrypted.
    // Returns true when a response was stored, false when the URL has no
    // host, the connection could not be opened, or the request could not
    // be written.
    $info = @parse_url($url);
    if (!is_array($info) || empty($info['host'])) {
        return false; // nothing to connect to
    }

    $host = $info['host'];
    $port = isset($info['port']) ? (int) $info['port'] : 80;

    $fp = @fsockopen($host, $port, $errno, $errstr, 10); // 10 second connect timeout
    if (!$fp) {
        return false;
    }

    $path = empty($info['path']) ? '/' : $info['path'];
    $query = isset($info['query']) ? '?' . $info['query'] : '';

    // A non-default port has to be repeated in the Host header.
    $hostHeader = ($port == 80) ? $host : $host . ':' . $port;

    $out = "GET " . $path . $query . " HTTP/1.0\r\n";
    $out .= "Host: " . $hostHeader . "\r\n";
    $out .= "Connection: close\r\n";
    $out .= "User-Agent: " . $this->Agent . "\r\n\r\n";

    if (fwrite($fp, $out) === false) {
        fclose($fp);
        return false;
    }

    $html = '';
    while (!feof($fp)) {
        $chunk = fread($fp, 8192);
        if ($chunk === false) {
            break; // read error: stop instead of looping forever
        }
        $html .= $chunk;
    }
    fclose($fp);

    $this->temp_everything = $html;
    return true;
}
// Returns everything that was read from the socket (headers and body),
// or false if nothing has been read yet.
// You must call GetEverything() first.
function ReturnEverything()
{
    // Hands back the stored raw response exactly as it was saved;
    // the property still holds its initial false if nothing was stored.
    return $this->temp_everything;
}
// Gets an array of URLs found in the fetched web page.
// Returns an array of URLs, or false on failure.
// You must call GetEverything() first.
function GetUrls($url)
{
    // Collects the link targets (href attributes) found in the body of the
    // response stored in $this->temp_everything.
    // $url is the address the page was fetched from; its scheme, host and
    // port are used to turn root-relative links ("/page.html") into
    // absolute ones.
    // Returns an array of absolute URLs (possibly empty), or false when no
    // response is stored.
    $raw = $this->temp_everything;
    if (!$raw) {
        return false;
    }

    // Work out "scheme://host[:port]" once; without a host, root-relative
    // links cannot be resolved and are skipped.
    $origin = false;
    $info = @parse_url($url);
    if (is_array($info) && !empty($info['host'])) {
        $scheme = isset($info['scheme']) ? $info['scheme'] : 'http';
        $origin = $scheme . '://' . $info['host'];
        if (isset($info['port'])) {
            $origin .= ':' . $info['port'];
        }
    }

    // Everything after the first blank line is the body; a response with
    // no blank line has no body to scan.
    $pieces = preg_split("/(\r\n\r\n|\r\r|\n\n)/", $raw, 2);
    $body = isset($pieces[1]) ? $pieces[1] : '';

    // preg_match_all() already takes $matches by reference; call-time
    // "&$matches" is a fatal error on PHP 5.4 and later.
    preg_match_all("|href\=\"?'?`?([[:alnum:]:?=&@/;._-]+)\"?'?`?|i", $body, $matches);

    $links = array();
    foreach ($matches[1] as $href) {
        if (preg_match("|^https?://|i", $href)) {
            // already absolute (http or https): keep as-is
            $links[] = $href;
        } elseif ($origin !== false && preg_match("|^/|", $href)) {
            // root-relative: prefix with the page's origin
            $links[] = $origin . $href;
        }
        // mailto: targets and document-relative paths are not collected
    }

    return $links;
}
// Gets the headers returned by the server.
// Returns false on failure and the headers on success.
// You must call GetEverything() first.
function GetHeaders()
{
    // Returns the header part of the stored response: everything before
    // the first blank line. Gives false when no response is stored.
    $response = $this->temp_everything;
    if ($response) {
        // Limit of 2 keeps blank lines inside the body from splitting further.
        list($headerBlock) = preg_split("/(\r\n\r\n|\r\r|\n\n)/", $response, 2);
        return $headerBlock;
    }
    return false;
}
// Gets the HTML (body) of the fetched page.
// Returns false on failure and the HTML on success.
// You must call GetEverything() first.
function GetHTML()
{
    // Returns the body of the stored response: everything after the first
    // blank line that ends the headers.
    // Gives false when no response is stored, and an empty string when the
    // response has headers but no body section.
    $raw = $this->temp_everything;
    if (!$raw) {
        return false;
    }

    // Limit of 2 so blank lines inside the body are left untouched.
    $pieces = preg_split("/(\r\n\r\n|\r\r|\n\n)/", $raw, 2);

    // Without a blank line there is no second piece; reading it blindly
    // would raise an undefined-index warning and return null.
    return isset($pieces[1]) ? $pieces[1] : '';
}
// Gets the text of a web page.
// Returns false on failure and the text on success.
// You must call GetEverything() first.
function GetTEXT()
{
$html = $this->temp_everything; // gets what was sent back
if (!$html) // check it's not false
{
return false; // if it is return false
}
$pieces = preg_split("/(\r\n\r\n|\r\r|\n\n)/", $html, 2); // split the HTML from the headers
// strip the HTML off and just leave text
$html = preg_replace('@<script[^>]*
?>
'."\n"; // echo out the HTML from free-php.org.uk echo 'HTML from free-php.org.uk:' . $robot->GetHTML() . '
'."\n"; // echo out the Headers from free-php.org.uk echo 'Headers from free-php.org.uk:' . $robot->GetHeaders() . '
'."\n"; // echo out the urls from free-php.org.uk echo 'Array of urls from free-php.org.uk:' . print_r($robot->GetUrls('http://www.free-php.org.uk/')) . '
'."\n"; // echo out everything from free-php.org.uk echo 'Everything from free-php.org.uk:' . $robot->ReturnEverything() . '
'."\n"; // echo out the status code from free-php.org.uk echo 'The status code from free-php.org.uk:' . $robot->GetStatus() . '
'."\n"; ?>