// a simple web crawler example
// v1.1
//
// Page the crawl starts from.
$toplink = "http://edmondhui.homeip.net/blog";
// Deepest level walkIt() will fetch; walkIt() reads this via `global $depth`.
$depth = 2;
// Start at level 1: with $depth = 2 the start page and the pages it links to
// are fetched; links found on those pages are printed but not fetched.
walkIt($toplink,1);
/**
 * Recursively crawl $link and print every absolute http(s) link found on it.
 *
 * The page is fetched with file(), its HTML entities are decoded, and every
 * double-quoted href="..." attribute is extracted. Each distinct absolute URL
 * is echoed on its own line, prefixed with one space per crawl level, and is
 * then crawled itself at $level + 1.
 *
 * @param string   $link     URL (or any path file() can open) of the page to fetch.
 * @param int      $level    Current crawl level; the top-level call passes 1.
 * @param int|null $maxDepth Deepest level that is still fetched. When omitted
 *                           (null), the global $depth is used, exactly as
 *                           before this parameter existed.
 * @return void
 */
function walkIt($link, $level, $maxDepth = null) {
    if ($maxDepth === null) {
        // Backward compatibility: fall back to the script-wide setting.
        global $depth;
        $maxDepth = $depth;
    }
    if ($level > $maxDepth) {
        return;
    }

    $lines = file($link);
    // file() returns false on failure but an empty array for an empty
    // document; only a strict comparison tells the two apart.
    if ($lines === false) {
        echo "failed to crawle - $link\n";
        return;
    }
    $content = html_entity_decode(implode('', $lines));
    //
    // Hook: if needed, process the page contents and save them to a DB here.
    //
    preg_match_all("/href=\".*?\"/i", $content, $matches);
    $indent = str_repeat(' ', $level);
    foreach (array_unique($matches[0]) as $attr) {
        // Strip the leading `href="` (6 characters) and the closing quote.
        $url = substr($attr, 6, -1);
        // Follow only URLs that START with an http(s) scheme. A relative link
        // that merely contains "http://" (e.g. in a query string) must not be
        // handed to file(), which would treat it as a local path.
        if (preg_match('#^https?://#i', $url) !== 1) {
            continue;
        }
        echo $indent . $url . "\n";
        walkIt($url, $level + 1, $maxDepth);
    }
}
?>
Next exercise: write this in Python!