Created
December 20, 2016 01:47
-
-
Save aaronpk/786c84682056bcbb5ebdd3d4932d9199 to your computer and use it in GitHub Desktop.
Given an input URL, find the canonical URL after following redirects and looking at rel=canonical
<?php
// Given an input URL, find the canonical URL after following redirects and
// looking for rel=canonical in the source HTML.
//
// Without a "url" query parameter this prints a small HTML form; with one it
// prints the canonical URL as a single line of plain text.
if(!isset($_GET['url'])) {
?>
<form action="" method="get">
<input type="url" name="url">
<input type="submit" value="Go">
</form>
<?php
  die();
}

header('Content-type: text/plain');

$url = $_GET['url'];

// The URL comes straight from the query string, so only accept plain strings
// with an http(s) scheme. Without this check cURL would happily fetch file://,
// gopher://, etc. on behalf of the caller.
// NOTE(review): this does not stop requests to internal hosts
// (e.g. http://localhost/); add an allow/deny list if this is exposed publicly.
$scheme = is_string($url) ? parse_url(trim($url), PHP_URL_SCHEME) : null;
if(!is_string($scheme) || !in_array(strtolower($scheme), ['http', 'https'], true)) {
  http_response_code(400);
  echo "Invalid URL: only http and https URLs are supported\n";
  die();
}
$url = trim($url);

$ch = curl_init($url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
curl_setopt($ch, CURLOPT_MAXREDIRS, 10);
// Redirects must not be able to escape to other protocols either
curl_setopt($ch, CURLOPT_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
curl_setopt($ch, CURLOPT_REDIR_PROTOCOLS, CURLPROTO_HTTP | CURLPROTO_HTTPS);
// Don't let a slow or unresponsive server hold the request open forever
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($ch, CURLOPT_TIMEOUT, 30);
// Some sites don't like crawlers, so pretend to be a browser
curl_setopt($ch, CURLOPT_HTTPHEADER, [
  'Accept: text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
  'User-Agent: Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
]);
$body = curl_exec($ch);

// The effective URL is where the request ended up after following redirects
$final_url = curl_getinfo($ch, CURLINFO_EFFECTIVE_URL);
if($final_url) {
  $url = $final_url;
}

// Check for rel=canonical. If the fetch failed or returned an empty body we
// fall through and report the effective (or original) URL instead.
if($body) {
  $dom = load_html($body);
  if($dom) {
    // Relative canonical hrefs are resolved against the page that was fetched
    $base = $url;
    foreach($dom->getElementsByTagName('link') as $link) {
      $rels = [];
      if($link->hasAttribute('rel') && ($relAtt = trim($link->getAttribute('rel'))) !== '') {
        // rel is a space-separated, case-insensitive token list
        $rels = preg_split('/\s+/', strtolower($relAtt));
      }
      $href = trim($link->getAttribute('href'));
      // Ignore canonical links with no usable href rather than printing a blank line.
      // As before, if several canonical links exist the last one wins.
      if($href !== '' && is_array($rels) && in_array('canonical', $rels, true)) {
        $url = resolve_relative_url($href, $base);
      }
    }
  }
}

echo $url."\n";

/**
 * Turn a possibly-relative href into an absolute URL using $base.
 *
 * Handles absolute URLs (returned untouched), protocol-relative ("//host/x"),
 * root-relative ("/x"), query-only ("?x"), fragment-only ("#x") and
 * path-relative ("x/y") references. Dot segments ("./", "../") are left in
 * place rather than normalized, and any user:pass@ part of $base is dropped.
 *
 * @param string $href Reference found in the document; must be non-empty.
 * @param string $base Absolute URL of the document the reference came from.
 * @return string Absolute URL, or $href unchanged if $base cannot be parsed.
 */
function resolve_relative_url($href, $base) {
  // Anything that already starts with a scheme is absolute
  if(preg_match('/^[a-z][a-z0-9+.\-]*:/i', $href)) {
    return $href;
  }

  $parts = parse_url($base);
  if($parts === false || !isset($parts['scheme']) || !isset($parts['host'])) {
    return $href;
  }

  if(substr($href, 0, 2) === '//') {
    return $parts['scheme'] . ':' . $href;
  }

  $origin = $parts['scheme'] . '://' . $parts['host'];
  if(isset($parts['port'])) {
    $origin .= ':' . $parts['port'];
  }
  $path = (isset($parts['path']) && $parts['path'] !== '') ? $parts['path'] : '/';

  $first = substr($href, 0, 1);
  if($first === '/') {
    return $origin . $href;
  }
  if($first === '?') {
    return $origin . $path . $href;
  }
  if($first === '#') {
    $query = isset($parts['query']) ? '?' . $parts['query'] : '';
    return $origin . $path . $query . $href;
  }

  // Path-relative: replace everything after the last slash of the base path
  $slash = strrpos($path, '/');
  $dir = ($slash === false) ? '/' : substr($path, 0, $slash + 1);
  return $origin . $dir . $href;
}

/**
 * Parse an HTML string into a DOMDocument, forcing it to be read as UTF-8.
 *
 * Parse errors and warnings from malformed markup are collected by libxml
 * and discarded, so real-world "tag soup" does not produce PHP warnings.
 * The caller's libxml error-handling mode is restored before returning.
 *
 * @param string $html Raw HTML source.
 * @return DOMDocument Always a DOMDocument; it is simply empty if libxml
 *                     could not make sense of the input.
 */
function load_html($html) {
  $dom = new DOMDocument;
  // Route parse errors into libxml's internal buffer instead of PHP warnings,
  // remembering the previous mode so this process-wide setting does not leak out.
  $previous = libxml_use_internal_errors(true);
  // Force interpreting this as UTF-8
  $dom->loadHTML('<?xml encoding="UTF-8">' . $html, LIBXML_NOWARNING|LIBXML_NOERROR);
  // Drop whatever was collected, then hand the setting back as we found it
  libxml_clear_errors();
  libxml_use_internal_errors($previous);
  return $dom;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment