1
0
Fork 0
feedizer-php/htdocs/scripts/crawler.php
2015-11-13 23:51:46 +01:00

115 lines
3.4 KiB
PHP

<?php
// Remember when this run began so the elapsed time can be logged later on.
$starttime = new DateTime('now');

// Report every error class except plain warnings.
error_reporting(E_ALL ^ E_WARNING);

// Pull in the project bootstrap.
// NOTE(review): presumably this is what provides CRAWLER_LOG_FILE, CRAWLER_VERSION
// and the feeds / feedItems classes used below - confirm in init.inc.php.
require __DIR__ . '/../includes/init.inc.php';
// Truncate logfile
// Once the log has grown past $maxloglen bytes, rewrite it so that only its
// newest part (at most $maxloglen bytes, starting on a line boundary) remains.
$maxloglen = 1000000; // = 1MB
$fs = filesize(CRAWLER_LOG_FILE); // false when the file cannot be stat'ed
if ($fs !== false && $fs > $maxloglen) {
    $fh = fopen(CRAWLER_LOG_FILE, 'r+');
    // fopen() yields false on failure (e.g. missing permissions). Warnings are
    // suppressed in this script, so check explicitly instead of handing false
    // to the stream functions below.
    if ($fh !== false) {
        // Jump to the start of the tail that should be kept ...
        fseek($fh, $fs - $maxloglen);
        // ... and throw away the (probably partial) line the seek landed in.
        fgets($fh);
        $buf = fread($fh, $maxloglen);
        // Only wipe the file when the tail was actually read; otherwise a
        // failed read would cost the whole log.
        if ($buf !== false) {
            ftruncate($fh, 0);
            rewind($fh);
            fwrite($fh, $buf);
        }
        fclose($fh);
    }
}
/**
 * Append one timestamped entry to the crawler log file.
 *
 * The entry is prefixed with the current local time in the form
 * "[YYYY-MM-DD HH:MM:SS] " and terminated with a newline; it is appended to
 * the file named by CRAWLER_LOG_FILE via error_log() message type 3.
 *
 * @param string $line Text of the log entry (without trailing newline).
 */
function logline($line) {
    $stamp = date('[Y-m-d H:i:s] ');
    $entry = $stamp . $line . "\n";
    error_log($entry, 3, CRAWLER_LOG_FILE);
}
// Ask the feed store which feeds are due for a refresh.
$feeds = feeds::getRefreshList();
if (empty($feeds)) {
    // Nothing is due: note that in the log together with the run time and stop.
    logline('Nothing to update.');
    // DateInterval::format('%s') only yields the seconds *component* (0-59) of
    // an interval, so the elapsed time is taken from Unix timestamps instead.
    logline('Execution time: ' . (time() - $starttime->getTimestamp()) . ' seconds');
    exit(0);
}
// A single cURL handle is created here and reused for every download
// (fetch() accesses it as the global $c).
$c = curl_init();

// Option table; entries are applied one by one, in this order, further down.
$crawlerCurlOptions = array(
    // Identify the crawler to the remote server.
    CURLOPT_USERAGENT => 'Mozilla/5.0 (compatible; FeedizerBot/' . CRAWLER_VERSION . '; +http://feedizer.tigris.fanir.de/page/bot)',
    // Hand the response back as a return value instead of printing it.
    CURLOPT_RETURNTRANSFER => true,
    // Allow 10 seconds to connect and 30 seconds for the whole transfer.
    CURLOPT_CONNECTTIMEOUT => 10,
    CURLOPT_TIMEOUT => 30,
    // Give up when the transfer stays below 1000 bytes/s for 10 seconds.
    CURLOPT_LOW_SPEED_LIMIT => 1000,
    CURLOPT_LOW_SPEED_TIME => 10,
    // Cap the average receive speed at 2,000,000 bytes/s.
    CURLOPT_MAX_RECV_SPEED_LARGE => 2000000,
    // Follow Location headers, but no more than 2 redirects.
    CURLOPT_FOLLOWLOCATION => true,
    CURLOPT_MAXREDIRS => 2,
);

// Each option is set with its own curl_setopt() call so that one rejected
// option does not keep the remaining ones from being applied.
foreach ($crawlerCurlOptions as $curlOption => $curlValue) {
    curl_setopt($c, $curlOption, $curlValue);
}
// Do not leave the helper variables behind in the global scope.
unset($crawlerCurlOptions, $curlOption, $curlValue);
/**
 * array_walk() callback: download one feed's page with the shared cURL handle.
 *
 * Reads the address from $val['uri'] and stores whatever curl_exec() returns
 * under $val['new_html']. With CURLOPT_RETURNTRANSFER enabled on the handle
 * that is the response body, or false when the transfer failed.
 *
 * @param array $val Feed entry, modified in place.
 * @param mixed $key Array key supplied by array_walk(); not used.
 */
function fetch(&$val, $key) {
    // The handle lives in the global scope as $c.
    $handle = $GLOBALS['c'];
    $target = $val['uri'];
    curl_setopt($handle, CURLOPT_URL, $target);
    $body = curl_exec($handle);
    $val['new_html'] = $body;
}
// Step 1: download all pages. fetch() stores each response (or false when
// the transfer failed) under the entry's 'new_html' key.
array_walk($feeds, 'fetch');

// Step 2: hand every download to the item store, log the outcome per feed
// and reschedule the feed - the latter happens whatever the outcome was.
foreach ($feeds as $feed) {
    $result = feedItems::newItem($feed['id'], $feed['new_html']);

    // The comparisons are intentionally loose (==), exactly like a switch.
    // NOTE(review): a false/null result would therefore be logged as
    // "updated" - confirm what feedItems::newItem() returns on failure.
    if ($result == 0) {
        $outcome = "\tupdated";
    } elseif ($result == 1) {
        $outcome = "\tunchanged";
    } else {
        $outcome = "\tFAILED!";
    }
    logline($feed['slug'] . $outcome);

    feeds::updateNextRefresh($feed['id']);
}
/* #VAR2
$c = array();
$cm = curl_multi_init();
$cm_running = null;
$cnt = 0;
foreach ($feeds as $i => $feed) {
$c[$i] = curl_init($feed['uri']);
// duh.
curl_setopt($c[$i], CURLOPT_USERAGENT, 'Mozilla/5.0 (compatible; FeedizerBot/' . CRAWLER_VERSION . '; +http://feedizer.tigris.fanir.de/page/bot)');
// return the transfer instead of print()ing
curl_setopt($c[$i], CURLOPT_RETURNTRANSFER, true);
// 10 sec connection-timeout, 30 sec total timeout
curl_setopt($c[$i], CURLOPT_CONNECTTIMEOUT, 10);
curl_setopt($c[$i], CURLOPT_TIMEOUT, 30);
// Abort if slower than 1 kbyte/s for more than 10 seconds
curl_setopt($c[$i], CURLOPT_LOW_SPEED_LIMIT, 1000);
curl_setopt($c[$i], CURLOPT_LOW_SPEED_TIME, 10);
// max average receive speed of 2 Mbyte/s
curl_setopt($c[$i], CURLOPT_MAX_RECV_SPEED_LARGE, 2000000);
// Follow max. 2 redirects (Location-header)
curl_setopt($c[$i], CURLOPT_FOLLOWLOCATION, true);
curl_setopt($c[$i], CURLOPT_MAXREDIRS, 2);
curl_multi_add_handle($cm, $c[$i]);
}
do {
curl_multi_exec($cm, $cm_running);
curl_multi_select($cm);
do {
$cm_info = curl_multi_info_read($cm, $cm_queuelen);
var_dump($cm_info);
} while ($cm_queuelen > 0);
} while ($cm_running > 0);
*/
// Log how long the whole run took, in seconds.
// DateInterval::format('%s') only yields the seconds *component* (0-59) of an
// interval, so any run longer than a minute was under-reported; the difference
// of Unix timestamps gives the total number of elapsed seconds.
logline('Execution time: ' . (time() - $starttime->getTimestamp()));