PHP Classes

File: example.php

Recommend this page to a friend!
  Classes of Trev Tune   PHP Search Engine Crawler   example.php   Download  
File: example.php
Role: Example script
Content type: text/plain
Description: A simple example on how to crawl several pages for links.
Class: PHP Search Engine Crawler
Crawl pages and extract links, images and metadata
Author: By Trev Tune
Last change: Fixed bugs and added a new parameter (singledomain)
Date: 10 years ago
Size: 1,667 bytes
 

Contents

Class file image Download
<!Doctype html>
<html>
<head>
<title>My crawler</title>
</head>
<body>

<?php

//error_reporting(-1);

/*Example link crawler
*@package :Simple crawler
*author : Trev Tune
*/

include 'crawler.class.php';

 
/* @seenlinks
*
*A multidimensional array of all crawled links, keyed first by domain and then by link
*
*e.g. print_r($seenlinks) may produce
Array(
  [domain] => Array( [link1] => link1 )
)
*/


// Start URL supplied by the user through the query string (?url=...).
// Guard against a missing or non-string parameter (e.g. ?url[]=x) so trim()
// is only ever given a string.
$url = (isset($_GET['url']) && is_string($_GET['url'])) ? trim($_GET['url']) : '';

// Filled in by crawl(): $seenlinks[domain][link] = link.
$seenlinks = array();

// Only crawl when a URL was actually provided; with an empty URL $seenlinks
// stays empty and the report section below handles the "nothing found" case.
if ($url !== '') {
    crawl($url, 2);
}


/**
 * Recursively crawl a page and record every visited URL in the global
 * $seenlinks array as $seenlinks[domain][url] = url.
 *
 * @param string $url          Page to crawl.
 * @param int    $depth        Remaining levels to follow; recursion stops at 0.
 * @param bool   $singledomain When true, links whose host differs from the
 *                             host of $url are skipped instead of followed.
 * @return void
 */
function crawl($url, $depth = 2, $singledomain = false)
{
    global $seenlinks;

    // Have we crawled into the specified depth?
    if ($depth == 0) {
        return;
    }

    $domain = host($url);

    // Have we crawled this url already?
    if (isset($seenlinks[$domain][$url])) {
        return;
    }

    $crawler = new crawler($url);

    // Record the page before following its links so that pages linking back
    // to each other do not recurse forever.
    $seenlinks[$domain][$url] = $url;

    // NOTE(review): getLinks() is assumed to return a collection keyed by
    // link URL -- confirm against crawler.class.php.
    $links = $crawler->getLinks();

    // Nothing to follow (also avoids iterating over null/false).
    if (empty($links)) {
        return;
    }

    foreach ($links as $link => $unused) {
        // Does the user want to crawl only a specific domain?
        // Compare the hosts (do not assign to $domain) and skip just this
        // link, so the remaining links on the page are still examined.
        if ($singledomain && $domain !== host($link)) {
            continue;
        }

        crawl($link, $depth - 1, $singledomain);
    }
}

/**
 * Return the host part of a URL with a leading "www." removed.
 *
 * Only a "www." prefix is stripped (case-insensitively); a "www." that occurs
 * elsewhere in the hostname is left untouched. The rest of the host keeps its
 * original letter case.
 *
 * @param string $url URL to inspect.
 * @return string Host without a leading "www.", or '' when the URL has no
 *                host component or cannot be parsed.
 */
function host($url)
{
    $host = parse_url($url, PHP_URL_HOST);

    // parse_url() yields null when there is no host and false on a seriously
    // malformed URL; report both as an empty host.
    if (!is_string($host)) {
        return '';
    }

    if (strncasecmp($host, 'www.', 4) === 0) {
        // Cast because substr() may return false on older PHP versions when
        // the host is exactly "www.".
        return (string) substr($host, 4);
    }

    return $host;
}

// Report section: prints every crawled link grouped by domain.
// $url comes from the query string and the domains/links come from crawled
// pages, so all of them are HTML-escaped before being echoed.

if (count($seenlinks) == 0) {
    die("No links found for " . htmlspecialchars((string) $url, ENT_QUOTES, 'UTF-8'));
}

// The <div> opened here is closed by the static HTML after the PHP block.
echo "

<div class='crawler'>
Domains/subdomains found = "
    . count($seenlinks) . "<br/>";

foreach ($seenlinks as $domain => $links) {
    // Array keys may have been converted to integers by PHP, hence the cast.
    $safeDomain = htmlspecialchars((string) $domain, ENT_QUOTES, 'UTF-8');

    echo "<br/> Domain " . $safeDomain . " has " . count($links) . " links <br>";

    foreach ($links as $link) {
        $safeLink = htmlspecialchars((string) $link, ENT_QUOTES, 'UTF-8');

        echo " <br/> $safeLink <br/>";

        // Separator after each link.
        echo "<hr/>";
    }

    // Separator after each domain.
    echo "<hr/>";
}

?>

</div>
</body>
</html>