",$d); for ($i=0;$i",$a[1]); $r[$a[0]] = $a[1]; } return $r; } #If we're here without any arguments, show a form if (!$url) { ?> Link Ripper

Link Ripper

This tool will reach out to a web page, cut out the chunk of that page's source between a start string and an end string, extract all the hyperlinks from that chunk, and return them in Mike's Newsfeed format for immediate consumption.

Here's how it works:

  1. Enter the information into the form below
  2. Find two unique strings of text in the html source that you wish to steal links from. Get them as close to the set of links as you can.
  3. Click submit to check the feed's results. If it comes back blank, check your URLs and your start and end strings again.
  4. When you get a feed working, have a script local to YOUR website read the feed's URL each hour. DO NOT spam my server with requests every time a user hits your page; doing so will slow down your page and overtax my server.

This script is not elegant by any means. It will not work on some complex pages or on pages with CGI query strings (?stuff=morestuff&somenumber=738902 in the URL). Let's give it a try...

URL of page to rip links from:


Name of the site you're ripping from:


Description you wish to include in the feed:


URL you wish to include in the feed:


Text to start parsing on:


Text to end parsing on:


Number of links you want in the feed:


Do relative links need absolute paths?

If so, what is the absolute URL you need to start from?:


$timecheck) { #file is new and cached $file = @join('', file("./cache/$urlcheck")); $cachecheck = 1; } } if (!$cachecheck) { $remoteurl = "http://".$url.""; # the following was modified on December 5th because the PHP file() quit retriving pages. $execute_fetch = exec("./fetch.pl $remoteurl"); # Fetch the file using PERL $file = @join('', file("./cache/temp.txt")); # Load the page data from a temp file created by the perl script. $localfilename = ereg_replace("http://","",$remoteurl); $localfilename = urlencode($localfilename); $filehandle = fopen("./cache/$localfilename","w+"); fwrite ($filehandle, $file); } # first fetch the page $page = $file; # $file = @join('', file("http://liquidtheater.com/")); # echo $file; # now filter out the bad stuff and keep the good stuff in reviewchunk ereg ("$start_string(.*)$end_string", $page, $linkchunk); # get rid of nasty array and go to nice simple var $linkchunk = $linkchunk[0]; # echo $linkchunk; # Filter out any non-link HTML characters $linkchunk = strip_tags($linkchunk, ""); #get back hyperlinks from function above $links = parse_hrefs($linkchunk); # start displaying an html file echo " $name http://$url $description "; # if we have links, print out the html file $count = -1; if (!$link_number) { $link_number = 9; } if ($links) { while(list($k,$v) = each($links)) { $link_number = $link_number - 1; if ($k && $v) { # strip out some junk off URLs $k = ereg_replace(" .*","",$k); # get rid of spaces $k = ereg_replace("\"","",$k); # get rid of extra quotes # check to see if relative url flag was selected, but do not fix urls that contain "http://" if ($fix_relative_urls && preg_match("/http:\/\//",$k) == 0) { $k = "$absolute_url$k"; # modified on 28 July 2003 to remove extra / } # $k = ereg_replace("\&.*;"," ",$k); # fix ampersands $v = ereg_replace("&","&",$v); # fix ampersands $v = ereg_replace("&","&",$v); # fix ampersands $k = ereg_replace("&","&",$k); # fix ampersands $k = ereg_replace("&","&",$k); # fix ampersands if 
($count <= $link_number) { echo "\n$v\n$k\n\n\n"; } } } # echo "
· More...
\n"; } echo " "; } ?>