When I do the scraping of the links, I get the errors 400/403/501 depending on the website (for example wordpress, digg)... But it works for some sites (for example iBlog)...
Probably something is fucked up with my curl code, but I can't see it...
Probably something is fucked up with my curl code, but I can't see it...
PHP:
<?php
$hostname = '{mail.domain.com:995/pop3/ssl/novalidate-cert}INBOX';
$username = 'confirm+domain.com';
$password = 'mypassword';
// connect...
$inbox = imap_open($hostname,$username,$password) or die(imap_last_error());
// get new emails
$emails = imap_search($inbox, 'NEW');
if($emails) {
foreach($emails as $email_number) { // for each mail...
$message = imap_fetchbody($inbox,$email_number,1); // get body from mail...
// remove line breaks
$data_with_no_breaks = preg_replace('/\n/si', ' ', $message);
$regex = '/http:(.+?) /si';
preg_match_all($regex, $data_with_no_breaks, $matches, PREG_PATTERN_ORDER);
$res = webFetcher($matches[0][0]);
echo $matches[0][0];
echo $res;
}
}
imap_close($inbox); // close the connection...
function getValue($item, $query, $end){
$item = stristr($item, $query);
$item = substr($item, strlen($query));
$stop = stripos($item, $end);
$val = substr($item, 0, $stop);
return $val;
}
function webFetcher($url) {
$agent = rnduseragent();
$ch = curl_init();
curl_setopt($ch, CURLOPT_URL, $url);
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 5);
curl_setopt($ch, CURLOPT_TIMEOUT, 10);
curl_setopt($ch, CURLOPT_REFERER, "http://sn105w.snt105.mail.live.com");
curl_setopt($ch, CURLOPT_USERAGENT, $agent);
curl_setopt($ch, CURLOPT_FOLLOWLOCATION, TRUE);
$result = curl_exec($ch);
curl_close($ch);
return $result;
}
function rnduseragent(){
$arr = array();
$arr[0] = "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-GB; rv:1.8.1.6) Gecko/20070725 Firefox/2.0.0.6";
$arr[1] = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1)";
$arr[2] = "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727; .NET CLR 3.0.04506.30)";
$arr[3] = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; .NET CLR 1.1.4322)";
$arr[4] = "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT 5.1; .NET CLR 1.1.4322)";
$arr[5] = "Opera/9.20 (Windows NT 6.0; U; en)";
$arr[6] = "Opera/9.00 (Windows NT 5.1; U; en)";
$arr[7] = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.50";
$arr[8] = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 8.0";
$arr[9] = "Mozilla/4.0 (compatible; MSIE 6.0; MSIE 5.5; Windows NT 5.1) Opera 7.02 [en]";
$nr = rand(0, 9);
$result = $arr[$nr];
return $result;
}
?>