A => beaubot/bot/proxy/AddressChecker.pm +82 -0
@@ 0,0 1,82 @@
+package AddressChecker;
+
+use strict;
+use warnings;
+use Socket;
+use Log::Any;
+
+my $logger = Log::Any::get_logger;
+
+my @hostnames = (
+ {
+ 'pattern' => qr/(?:ppp|adsl)\.tpnet\.pl$/,
+ 'type' => 'dynamic',
+ 'block' => '12 hours',
+ },
+ {
+ 'pattern' => qr/^(?:dial|dynamic)-.+\.dialog\.net\.pl$/,
+ 'type' => 'dynamic',
+ },
+ {
+ 'pattern' => qr/\.dynamic\.t-mobile\.pl$/,
+ 'type' => 'dynamic',
+ },
+ {
+ 'pattern' => qr/^user-\d+-\d+-\d+-\d+\.play-internet\.pl$/,
+ 'type' => 'dynamic',
+ },
+ {
+ 'pattern' => qr/\.adsl\.inetia\.pl$/,
+ 'type' => 'dynamic',
+ 'block' => '12 hours',
+ },
+ {
+ 'pattern' => qr/\.gprs.*\.plus(?:gsm)?\.pl$/,
+ 'type' => 'dynamic',
+ },
+ {
+ 'pattern' => qr/-gprs.+\.centertel\.pl$/,
+ 'type' => 'dynamic',
+ },
+ {
+ 'pattern' => qr/dynamic\.chello\.pl$/,
+ 'type' => 'dynamic',
+ 'block' => '4 months',
+ },
+ {
+ 'pattern' => qr/dynamic/,
+ 'type' => 'dynamic',
+ 'block' => '2 months',
+ },
+);
+
+sub check {
+ my $address = shift;
+ $logger->debug("Checking $address");
+ my $result = {};
+ my $info;
+ my $hostname = gethostbyaddr( inet_aton($address), AF_INET );
+
+ if ( defined $hostname ) {
+ $result->{hostname} = $hostname;
+ $logger->debug("The address $address resolves to $hostname");
+ foreach my $entry (@hostnames) {
+ if ( $hostname =~ $entry->{pattern} ) {
+ $info = $entry;
+ last;
+ }
+ }
+ }
+
+ if ( defined $info ) {
+ $result->{type} = $info->{type};
+ $result->{block} = $info->{block};
+ }
+ else {
+ $result->{type} = 'static';
+ $result->{block} = '6 months';
+ }
+ return $result;
+}
+
+1;
A => beaubot/bot/proxy/ProxyDatabase.pm +225 -0
@@ 0,0 1,225 @@
+package ProxyDatabase;
+
+use strict;
+use warnings;
+use Log::Any;
+use DBI;
+use DateTime;
+use DateTime::Format::Strptime;
+
+my $logger = Log::Any->get_logger;
+
+my $dateFormat = new DateTime::Format::Strptime(
+ pattern => '%Y-%m-%dT%TZ',
+ time_zone => 'UTC',
+ on_error => 'croak',
+);
+
+sub new {
+ my $caller = shift;
+ my $class = ref($caller) || $caller;
+
+ my $this = {
+ 'dbh' => undef,
+ 'file' => undef,
+ @_,
+ };
+ bless $this, $class;
+
+ die "A path to a database is missing\n"
+ unless defined $this->{file};
+
+ $this->{dbh} = DBI->connect( "dbi:SQLite:dbname=$this->{file}", "", "", { RaiseError => 1, PrintError => 0, sqlite_use_immediate_transaction => 1 } );
+
+ return $this;
+}
+
+# ------------------------------------------------------------------------------
+
+sub begin {
+ my $this = shift;
+ $this->{dbh}->begin_work;
+}
+
+sub commit {
+ my $this = shift;
+ $this->{dbh}->commit;
+}
+
+# ------------------------------------------------------------------------------
+
+sub getSetting {
+ my $this = shift;
+ my $name = shift;
+
+ my $sth = $this->{dbh}->prepare_cached('SELECT value FROM settings WHERE name = ?');
+ $sth->execute($name);
+ my ($value) = $sth->fetchrow_array();
+ $sth->finish;
+ return $value;
+}
+
+sub setSetting {
+ my $this = shift;
+ my $name = shift;
+ my $value = shift;
+
+ my $sth = $this->{dbh}->prepare_cached('INSERT OR REPLACE INTO settings(name, value) VALUES(?, ?)');
+ $sth->execute( $name, $value );
+}
+
+# ------------------------------------------------------------------------------
+
+sub insertOrIgnoreProxy {
+ my $this = shift;
+ my $address = shift;
+ my $source = shift;
+
+ my $sth = $this->{dbh}->prepare_cached('INSERT OR IGNORE INTO proxies (proxy_address, proxy_added, proxy_source) VALUES (?, ?, ?)');
+ $sth->execute( $address, $dateFormat->format_datetime( DateTime->now ), $source );
+}
+
+sub fetchProxiesToCheck {
+ my $this = shift;
+ my $since = shift;
+ my $limit = shift || 50;
+
+ my $sth = $this->{dbh}->prepare_cached( "
+ SELECT proxy_id, proxy_address
+ FROM proxies
+ WHERE proxy_checked IS NULL OR proxy_checked < ?
+ ORDER BY proxy_checked, proxy_id DESC
+ LIMIT $limit
+" );
+ $sth->execute( $dateFormat->format_datetime($since) );
+
+ my @list;
+
+ while ( my $row = $sth->fetchrow_hashref ) {
+ push @list, $row;
+ }
+ return @list;
+}
+
+# ------------------------------------------------------------------------------
+
+sub insertCheckResult {
+ my $this = shift;
+ my $proxy = shift;
+ my $status = shift;
+ my $type = shift;
+ my $address = shift;
+
+ my $timestamp = $dateFormat->format_datetime( DateTime->now );
+ my $insertSth = $this->{dbh}->prepare_cached('INSERT INTO checks (check_proxy, check_timestamp, check_status, check_type, check_address) VALUES (?, ?, ?, ?, ?)');
+ $insertSth->execute( $proxy, $timestamp, $status, $type, $address );
+ my $updateSth = $this->{dbh}->prepare_cached('UPDATE proxies SET proxy_checked = ? WHERE proxy_id = ?');
+ $updateSth->execute( $timestamp, $proxy );
+}
+
+# ------------------------------------------------------------------------------
+
+sub insertOrIgnoreBlock {
+ my $this = shift;
+ my $target = shift;
+ my $start = shift;
+ my $expiry = shift;
+
+ my $sth = $this->{dbh}->prepare_cached('INSERT OR IGNORE INTO blocks (block_address, block_start, block_expiry) VALUES (?, ?, ?)');
+ $sth->execute( $target, $dateFormat->format_datetime($start), $expiry );
+}
+
+sub removeBlock {
+ my $this = shift;
+ my $blockId = shift;
+
+ my $sth = $this->{dbh}->prepare_cached('DELETE FROM blocks WHERE block_id = ?');
+ $sth->execute($blockId);
+}
+
+sub fetchBlocks {
+ my $this = shift;
+ my $blockId = shift;
+ my $limit = shift || 50;
+
+ my $sth = $this->{dbh}->prepare_cached("SELECT block_id AS id, block_address AS address, block_start AS start, block_expiry AS expiry FROM blocks ORDER BY block_id ASC LIMIT $limit");
+ $sth->execute;
+
+ my @list;
+ while ( my $row = $sth->fetchrow_hashref ) {
+ $row->{start} = $dateFormat->parse_datetime( $row->{start} );
+ push @list, $row;
+ }
+ return @list;
+}
+
+# ------------------------------------------------------------------------------
+
+my @sessionChars = ( 'A' .. 'Z', 'a' .. 'z', '0' .. '9' );
+
+sub generateSessionId {
+ my $result = '';
+
+ for ( my $i = 0 ; $i < 32 ; $i++ ) {
+ $result .= $sessionChars[ int( rand( scalar @sessionChars ) ) ];
+ }
+ return $result;
+}
+
+sub createSession {
+ my $this = shift;
+
+ my $sth = $this->{dbh}->prepare_cached('INSERT INTO sessions (session_id) VALUES (?)');
+ my $count = 0;
+ while ( $count < 100 ) {
+ my $sessionId = generateSessionId();
+ eval { #
+ $sth->execute($sessionId);
+ };
+ my $error = $@;
+ if ($error) {
+ die $error
+ unless $error =~ /column session_id is not unique/;
+ }
+ else {
+ return $sessionId;
+ }
+ $count++;
+ }
+ die "Unable to generate sessionId\n";
+}
+
+sub destroySession {
+ my $this = shift;
+ my $sessionId = shift;
+
+ my $sth = $this->{dbh}->prepare_cached('DELETE FROM sessions WHERE session_id = ?');
+ $sth->execute($sessionId);
+}
+
+sub getSessionAddress {
+ my $this = shift;
+ my $sessionId = shift;
+
+ my $sth = $this->{dbh}->prepare_cached('SELECT session_address FROM sessions WHERE session_id = ?');
+
+ $sth->execute($sessionId);
+ my ($address) = $sth->fetchrow_array;
+ $sth->finish;
+ return $address;
+}
+
+sub setSessionAddress {
+ my $this = shift;
+ my $sessionId = shift;
+ my $address = shift;
+
+ my $sth = $this->{dbh}->prepare_cached('UPDATE sessions SET session_address = ? WHERE session_id = ? AND session_address IS NULL');
+ $sth->execute( $address, $sessionId );
+
+ return $sth->rows;
+}
+
+# ------------------------------------------------------------------------------
+
+1;
A => beaubot/bot/proxy/check-ip.pl +36 -0
@@ 0,0 1,36 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Data::Dumper;
+use FindBin qw($RealBin);
+use lib "$RealBin/..";
+use Bot4;
+use ProxyDatabase;
+use DateTime;
+use AddressChecker;
+
+my $db = "$RealBin/../var/proxy.sqlite";
+my $bot = new Bot4;
+
+$bot->single(1);
+$bot->addOption( "database=s", \$db, "Changes path to a database" );
+$bot->setup( 'root' => "$RealBin/.." );
+
+my $logger = Log::Any::get_logger;
+$logger->info("Start");
+
+my $dbh = ProxyDatabase->new( 'file' => $db );
+
+my $sth = $dbh->{dbh}->prepare_cached('SELECT * FROM checks WHERE check_status <> 0');
+$sth->execute;
+while ( my $row = $sth->fetchrow_hashref ) {
+ my $result = AddressChecker::check( $row->{check_address} );
+ if ( $logger->is_debug ) {
+ $logger->debug( "Result:\n" . Dumper( [ $row, $result ] ) );
+ }
+ if ( defined $result->{block} ) {
+ $logger->info("Placing block on $row->{check_address} with expiry: $result->{block}");
+ $dbh->insertOrIgnoreBlock( $row->{check_address}, DateTime->now(), $result->{block} );
+ }
+
+}
A => beaubot/bot/proxy/log4perl.conf +10 -0
@@ 0,0 1,10 @@
+log4perl.oneMessagePerAppender = 1
+
+log4perl.rootLogger=INFO, console
+
+log4perl.category.main = DEBUG, console
+
+log4perl.appender.console=Log::Log4perl::Appender::Screen
+log4perl.appender.console.layout=Log::Log4perl::Layout::PatternLayout
+log4perl.appender.console.layout.ConversionPattern=[%d{dd-MM-yyyy HH:mm:ss}][%P] %p %m%n
+log4perl.appender.console.utf8=1
A => beaubot/bot/proxy/proxy-block.pl +73 -0
@@ 0,0 1,73 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Data::Dumper;
+use FindBin qw($RealBin);
+use lib "$RealBin/..";
+use Bot4;
+use ProxyDatabase;
+
+my $db = "$RealBin/../var/proxy.sqlite";
+my $bot = new Bot4;
+
+$bot->single(1);
+$bot->addOption( "database=s", \$db, "Changes path to a database" );
+$bot->setProject( 'wikipedia', 'pl', 'sysop' );
+$bot->setup( 'root' => "$RealBin/.." );
+
+my $logger = Log::Any::get_logger;
+$logger->info("Start");
+
+my $dbh = ProxyDatabase->new( 'file' => $db );
+
+my $api = $bot->getApi;
+$api->checkAccount;
+
+while ( my @list = $dbh->fetchBlocks ) {
+ foreach my $block (@list) {
+ $logger->info("Processing block request of $block->{address}");
+
+ my $blocked = 0;
+ {
+ my $iterator = $api->getIterator(
+ 'action' => 'query',
+ 'list' => 'blocks',
+ 'bkip' => $block->{address},
+ );
+
+ while ( my $entry = $iterator->next ) {
+ $blocked++;
+ $logger->info( "Active blocks of $block->{address}:\n" . Dumper($entry) )
+ if $logger->is_info;
+ }
+ }
+ {
+ my $iterator = $api->getIterator(
+ 'action' => 'query',
+ 'list' => 'globalblocks',
+ 'bgip' => $block->{address},
+ );
+
+ while ( my $entry = $iterator->next ) {
+ $blocked++;
+ $logger->info( "Active global blocks of $block->{address}:\n" . Dumper($entry) )
+ if $logger->is_info;
+ }
+ }
+ if ($blocked) {
+ $logger->info("$block->{address} is already blocked");
+ }
+ else {
+ $logger->info("Blocking $block->{address} with expiry $block->{expiry}");
+ $api->block(
+ 'anononly' => 1,
+ 'nocreate' => 1,
+ 'user' => $block->{address},
+ 'reason' => '[[WP:OP|open proxy]]',
+ 'expiry' => $block->{expiry},
+ );
+
+ }
+ $dbh->removeBlock( $block->{id} );
+ }
+}
A => beaubot/bot/proxy/proxy-check-verifier.pl +29 -0
@@ 0,0 1,29 @@
+#!/usr/bin/perl
+
+use strict;
+use utf8;
+use CGI;
+use ProxyDatabase;
+use Env;
+use FindBin qw($RealBin);
+
+binmode STDOUT, ":utf8";
+
+my $db = "$RealBin/../var/proxy.sqlite";
+my $sessionId = CGI::param('id');
+my $remoteAddr = $ENV{REMOTE_ADDR};
+
+sub error {
+ print "Content-type: text/html; charset=utf-8\n\nERROR";
+ exit(0);
+}
+
+error()
+ unless defined $sessionId and defined $remoteAddr;
+
+my $dbh = ProxyDatabase->new( 'file' => $db );
+
+error()
+ unless $dbh->setSessionAddress( $sessionId, $remoteAddr );
+
+print "Content-type: text/html; charset=utf-8\n\nSUCCESS";
A => beaubot/bot/proxy/proxy-check-worker.pl +298 -0
@@ 0,0 1,298 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Data::Dumper;
+use FindBin qw($RealBin);
+use ProxyDatabase;
+use AddressChecker;
+use LWP::UserAgent;
+use WWW::Mechanize;
+use IO::Handle;
+use Storable qw(freeze thaw);
+use File::Spec;
+use Log::Any;
+use Log::Any::Adapter;
+use LWP::Protocol::socks;
+
+binmode STDIN;
+binmode STDOUT;
+STDOUT->autoflush(1);
+
+my $db = "$RealBin/../var/proxy.sqlite";
+my $testUrl = 'http://94.23.242.48/~beau/cgi-bin/verify.pl?id=$sessionId';
+my $agent = 'Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.9.0.7) Gecko/2009021910 Firefox/3.0.7';
+
+my %ignoredAddresses = map { $_ => 1 } qw(94.23.242.48 178.33.53.23 178.32.200.207 127.0.0.1);
+
+# Send all logs to Log::Log4perl
+use Log::Log4perl;
+Log::Log4perl->init( File::Spec->join( $RealBin, 'log4perl.conf' ) );
+Log::Any::Adapter->set('Log4perl');
+
+my $logger = Log::Any::get_logger;
+$logger->info("Worker is being started");
+
+my $dbh = ProxyDatabase->new( 'file' => $db );
+
+sub checkProxyLwp {
+ my $info = shift;
+ my $proxy = shift;
+
+ $logger->info("Checking proxy '$proxy' using LWP::UserAgent");
+ my $result = {
+ 'address' => undef,
+ 'status' => 0,
+ };
+ my $sessionId = $dbh->createSession();
+ eval {
+ my $ua = LWP::UserAgent->new(
+ 'agent' => $agent,
+ 'timeout' => 30,
+ );
+ $ua->proxy( [ 'http', 'https' ] => $proxy );
+
+ my $url = $testUrl;
+ $url =~ s/\$sessionId/$sessionId/g;
+
+ $ua->get($url);
+ my $address = $dbh->getSessionAddress($sessionId);
+
+ $result->{address} = $address;
+ $result->{status} = defined $address ? 1 : 0;
+ };
+ my $error = $@;
+ $dbh->destroySession($sessionId);
+ die $error
+ if $error;
+
+ return $result;
+}
+
+sub checkHttpProxy {
+ my $info = shift;
+
+ # Do not check proxies without a specified port
+ return { status => 0 }
+ unless defined $info->{port};
+
+ my $result = checkProxyLwp( $info, "http://$info->{address}:$info->{port}/" );
+ $result->{type} = 'http';
+ $result->{accepted} = '1';
+ return $result;
+}
+
+sub checkSocks4Proxy {
+ my $info = shift;
+
+ # Do not check proxies without a specified port
+ return { status => 0 }
+ unless defined $info->{port};
+
+ my $result = checkProxyLwp( $info, "socks4://$info->{address}:$info->{port}/" );
+ $result->{type} = 'socks4';
+ $result->{accepted} = '1';
+ return $result;
+}
+
+sub checkSocks5Proxy {
+ my $info = shift;
+
+ # Do not check proxies without a specified port
+ return { status => 0 }
+ unless defined $info->{port};
+
+ my $result = checkProxyLwp( $info, "socks://$info->{address}:$info->{port}/" );
+ $result->{type} = 'socks5';
+ $result->{accepted} = '1';
+ return $result;
+}
+
+sub checkWebProxy {
+ my $info = shift;
+
+ # Do not check proxies with a specified port
+ return { status => 0 }
+ if defined $info->{port};
+
+ # Do not check non-web proxies
+ return { status => 0 }
+ unless $info->{address} =~ m{^https?://}i;
+
+ $logger->info("Checking proxy '$info->{address}' using WWW::Mechanize");
+ my $result = {
+ 'address' => undef,
+ 'status' => 0,
+ };
+ my $sessionId = $dbh->createSession();
+ eval {
+ my $mech = new WWW::Mechanize(
+ 'agent' => $agent,
+ 'timeout' => 30,
+ 'autocheck' => 0,
+ 'quiet' => 1,
+ );
+
+ my $response = $mech->get( $info->{address} );
+ return unless $response->is_success;
+
+ my $form;
+ my $field;
+
+ foreach my $f ( 'u', 'q', 'url', 'rxproxyuri' ) {
+ $form = $mech->form_with_fields($f);
+ $field = $f;
+ next if $form and $form->method eq 'SEND';
+ last if $form;
+ }
+
+ unless ( defined $form ) {
+ $logger->debug("Unable to find form");
+ return;
+ }
+ my $url = $testUrl;
+ $url =~ s/\$sessionId/$sessionId/g;
+ $mech->field( $field, $url );
+ $mech->submit;
+
+ my $address = $dbh->getSessionAddress($sessionId);
+
+ $result->{address} = $address;
+ $result->{status} = defined $address ? 1 : 0;
+ $result->{type} = 'web proxy';
+ };
+ my $error = $@;
+ $dbh->destroySession($sessionId);
+ die $error
+ if $error;
+
+ $result->{accepted} = '1';
+ return $result;
+}
+
+my @methods = ( #
+ \&checkHttpProxy,
+ \&checkSocks4Proxy,
+ \&checkSocks5Proxy,
+ \&checkWebProxy,
+);
+
+sub checkProxy {
+ my $info = shift;
+
+ foreach my $method (@methods) {
+
+ # TODO: Check till no new addresses are discovered...
+ my $result;
+ for ( my $i = 0 ; $i < 3 ; $i++ ) {
+ $result = $method->($info);
+ last if $result->{status};
+ sleep(1)
+ if $result->{accepted};
+ }
+ return $result
+ if $result->{status};
+ }
+ return { status => 0 };
+}
+
+# ------------------------------------------------------------------------------
+
+sub sendMessage {
+ my $message = shift;
+
+ if ( $logger->is_debug ) {
+ $logger->debug( "Sending a message:\n" . Dumper($message) );
+ }
+
+ my $data = freeze($message);
+ STDOUT->print( bytes::length($data) . "\n" . $data )
+ or die "Unable to send a message: $!\n";
+}
+
+sub readMessage {
+ my $size = readline(STDIN);
+
+ die "Unable to read a message header: $!\n"
+ unless defined $size;
+
+ chomp($size);
+
+ my $buffer;
+ my $len = read( STDIN, $buffer, $size );
+
+ if ( $len < $size ) {
+ die "Received a corrupted message: expected $len bytes instead of $size bytes\n";
+ }
+ my $message = thaw $buffer;
+ if ( $logger->is_debug ) {
+ $logger->debug( "Received a message:\n" . Dumper($message) );
+ }
+ return $message;
+}
+
+# ------------------------------------------------------------------------------
+
+eval {
+ $logger->info("Worker is entering main loop");
+ while (1) {
+ $0 = 'proxy-check-worker: idle';
+ last
+ if eof(STDIN);
+ my $message = readMessage();
+
+ die "Invalid message\n"
+ unless ref($message) eq 'HASH' and defined $message->{command};
+
+ if ( $message->{command} eq 'CHECK' ) {
+ $0 = "proxy-check-worker: checking $message->{address}";
+
+ if ( $message->{address} =~ /^(.+?):(\d+)$/ ) {
+ $message->{address} = $1;
+ $message->{port} = $2;
+ }
+
+ my $result = checkProxy($message);
+
+ if ( $result->{status} ) {
+
+ my $reject = undef;
+
+ if ( $result->{address} !~ /^\d+\.\d+\.\d+\.\d+$/ ) {
+ $reject = "not a valid IPv4 address '$result->{address}'";
+ }
+ elsif ( $ignoredAddresses{ $result->{address} } ) {
+ $reject = "ignored address '$result->{address}'";
+ }
+
+ if ( defined $reject ) {
+ $logger->info("Ignoring proxy $message->{proxy}: $reject");
+ $result = { status => 0 };
+ }
+ else {
+
+ # Check type of address (isp, country, service)
+ my $addressCheck = AddressChecker::check( $result->{address} );
+ $result->{block} = $addressCheck->{block};
+ $result->{hostname} = $addressCheck->{hostname};
+ $result->{addressType} = $addressCheck->{type};
+ }
+ }
+
+ $result->{proxy} = $message->{proxy};
+ $result->{command} = 'CHECKRESULT';
+ sendMessage($result);
+ }
+ elsif ( $message->{command} eq 'QUIT' ) {
+ last;
+ }
+ else {
+ die "Unsupported command '$message->{command}'\n";
+ }
+ }
+ $logger->info("Worker is being terminated");
+};
+if ($@) {
+ $@ =~ s/\s+$//;
+ $logger->fatal("Worker is being terminated: $@");
+ exit(1);
+}
A => beaubot/bot/proxy/proxy-check.pl +252 -0
@@ 0,0 1,252 @@
+#!/usr/bin/perl -w
+
+use strict;
+use LWP::UserAgent;
+use Data::Dumper;
+use FindBin qw($RealBin);
+use lib "$RealBin/..";
+use Storable qw(freeze thaw);
+use Bot4;
+use AnyEvent;
+use IPC::Open2;
+use ProxyDatabase;
+use Time::HiRes qw(clock_gettime CLOCK_MONOTONIC);
+
+my $maxWorkers = 10;
+
+my $db = "$RealBin/../var/proxy.sqlite";
+my $bot = new Bot4;
+$bot->single(1);
+$bot->addOption( "database=s", \$db, "Changes path to a database" );
+$bot->setup( 'root' => "$RealBin/.." );
+
+my $logger = Log::Any::get_logger;
+$logger->info("Start");
+
+my $dbh = ProxyDatabase->new( 'file' => $db );
+
+my %workers; # pid => $worker
+my %jobsTaken; # proxy_id => $worker
+my @jobsQueue; # list of { proxy_id, proxy_address }
+my $lastJobsFetched = clock_gettime(CLOCK_MONOTONIC) - 60; # timestamp, when last jobs were fetched
+
+sub sendMessage {
+ my $child = shift;
+ my $message = shift;
+
+ if ( $logger->is_debug ) {
+ $logger->debug( "Sending a message to a worker $child->{pid}:\n" . Dumper($message) );
+ }
+
+ my $data = freeze($message);
+ $child->{in}->print( bytes::length($data) . "\n" . $data )
+ or die "Unable to send a message to a worker $child->{pid}: $!\n";
+}
+
+sub readMessage {
+ my $child = shift;
+
+ my $size = readline( $child->{out} );
+
+ die "Unable to read from a worker $child->{pid}: $!\n"
+ unless defined $size;
+
+ chomp($size);
+
+ my $buffer;
+ my $len = read( $child->{out}, $buffer, $size );
+
+ if ( $len < $size ) {
+ die "Received a corrupted message from a worker $child->{pid}: expected $len bytes instead of $size bytes\n";
+ }
+ my $message = thaw $buffer;
+ if ( $logger->is_debug ) {
+ $logger->debug( "Received a message from a worker $child->{pid}:\n" . Dumper($message) );
+ }
+ return $message;
+}
+
+sub killWorker {
+ my $child = shift;
+ $logger->info("Killing worker $child->{pid}");
+ unassignJob($child);
+ undef $child->{childWatcher};
+ undef $child->{ioWatcher};
+ kill 'KILL', $child->{pid}
+ unless defined $child->{dead};
+ close( $child->{in} );
+ close( $child->{out} );
+ delete $workers{ $child->{pid} };
+}
+
+sub spawnWorker {
+ my $child = {
+ 'pid' => undef,
+ 'in' => undef,
+ 'out' => undef,
+ };
+ $child->{pid} = open2( $child->{out}, $child->{in}, 'perl', 'proxy-check-worker.pl' );
+ binmode $child->{out}
+ if $child->{out};
+ binmode $child->{in}
+ if $child->{in};
+
+ $child->{childWatcher} = AnyEvent->child(
+ pid => $child->{pid},
+ cb => sub {
+ my ( $pid, $status ) = @_;
+ $logger->info("Worker $child->{pid} has exited with status: $status");
+ $child->{dead} = $status;
+ killWorker($child);
+ },
+ );
+
+ $child->{ioWatcher} = AnyEvent->io(
+ 'fh' => $child->{out},
+ 'poll' => 'r',
+ 'cb' => sub {
+ eval {
+ my $message = readMessage($child);
+ processMessage( $child, $message );
+ };
+ if ($@) {
+ $@ =~ s/\s+$//;
+ $logger->error($@);
+ killWorker($child);
+ return;
+ }
+ }
+ );
+
+ $workers{ $child->{pid} } = $child;
+ return $child;
+}
+
+sub fetchJobs {
+ my $time = clock_gettime(CLOCK_MONOTONIC);
+ return if $time - $lastJobsFetched < 59;
+ $logger->debug("Fetching proxies to check");
+ my $dt = DateTime->now;
+ $dt -= DateTime::Duration->new( 'months' => 1 );
+ @jobsQueue = grep { !exists $jobsTaken{ $_->{proxy_id} } } $dbh->fetchProxiesToCheck( $dt, 200 );
+ $lastJobsFetched = $time;
+}
+
+sub sendJob {
+ my $worker = shift;
+
+ fetchJobs unless @jobsQueue;
+ my $job = shift @jobsQueue;
+ return unless $job;
+
+ eval {
+ sendMessage(
+ $worker,
+ {
+ 'command' => 'CHECK',
+ 'proxy' => $job->{proxy_id},
+ 'address' => $job->{proxy_address},
+ }
+ );
+
+ $worker->{job} = $job;
+ $worker->{jobStart} = clock_gettime(CLOCK_MONOTONIC);
+
+ $jobsTaken{ $job->{proxy_id} } = $worker;
+ };
+ my $error = $@;
+ if ($error) {
+ unshift @jobsQueue, $job;
+ $logger->error("Unable to send a job to a worker $worker->{pid}: $error");
+ killWorker($worker);
+ }
+}
+
+sub unassignJob {
+ my $worker = shift;
+
+ my $job = $worker->{job};
+ return unless defined $job;
+
+ $worker->{job} = undef;
+ $worker->{jobTime} = undef;
+ delete $jobsTaken{ $job->{proxy_id} };
+}
+
+sub processMessage {
+ my $worker = shift;
+ my $message = shift;
+
+ die "Invalid message\n"
+ unless ref($message) eq 'HASH' and defined $message->{command};
+
+ if ( $message->{command} eq 'CHECKRESULT' ) {
+ if ( $message->{proxy} != $worker->{job}->{proxy_id} ) {
+ die "Invalid job id, got: $message->{proxy}, expected: $worker->{job}->{proxy_id}\n";
+ }
+ unassignJob($worker);
+
+ $dbh->insertCheckResult( @{$message}{ 'proxy', 'status', 'type', 'address' } );
+
+ # If the address qualifies for blocking append block request to the queue
+ if ( defined $message->{block} ) {
+ $dbh->insertOrIgnoreBlock( $message->{address}, DateTime->now(), $message->{block} );
+ }
+ sendJob($worker);
+ }
+ else {
+ die "Unsupported command '$message->{command}'\n";
+ }
+}
+
+sub min ($$) {
+ return $_[ $_[0] > $_[1] ];
+}
+
+my $quit = AnyEvent->condvar;
+
+my $jobs = AnyEvent->timer(
+ after => 0,
+ interval => 60,
+ cb => sub {
+ unless (@jobsQueue) {
+
+ # Empty job queue, fetch more
+ fetchJobs;
+ }
+
+ # Spawn workers if needed
+ my $currentWorkers = scalar keys %workers;
+ my $workersToSpawn = min( $maxWorkers - $currentWorkers, scalar @jobsQueue );
+
+ if ( $workersToSpawn > 0 ) {
+ $logger->info("Spawning workers: $workersToSpawn");
+ for ( my $i = 0 ; $i < $workersToSpawn ; $i++ ) {
+ spawnWorker;
+ }
+ }
+ my $time = clock_gettime(CLOCK_MONOTONIC);
+ foreach my $worker ( values %workers ) {
+ if ( $worker->{job} ) {
+
+ # Check job time
+ if ( $time - $worker->{jobStart} > 3600 ) {
+ killWorker($worker);
+ }
+ }
+ else {
+
+ # Assign job to worker
+ sendJob($worker);
+ }
+ }
+ },
+);
+
+eval { #
+ $quit->recv;
+};
+if ($@) {
+ $@ =~ s/\s+$//;
+ $logger->fatal($@);
+}
A => beaubot/bot/proxy/proxy-import-centrump2p.pl +57 -0
@@ 0,0 1,57 @@
+#!/usr/bin/perl -w
+
+use strict;
+use FindBin qw($RealBin);
+use lib "$RealBin/..";
+use Bot4;
+use ProxyDatabase;
+use WWW::Mechanize;
+use HTML::Entities qw(decode_entities);
+
+my $db = "$RealBin/../var/proxy.sqlite";
+my $bot = new Bot4;
+
+$bot->single(1);
+$bot->addOption( "database=s", \$db, "Changes path to a database" );
+$bot->setup( 'root' => "$RealBin/.." );
+
+my $logger = Log::Any::get_logger;
+$logger->info("Start");
+
+my $dbh = ProxyDatabase->new( 'file' => $db );
+my $mech = new WWW::Mechanize;
+$mech->agent_alias('Windows Mozilla');
+
+$mech->get('http://prx.centrump2p.com/');
+
+sub extractLinks {
+ my @links = $mech->content =~ /<td class="a1"><a href="([^"]+)/g;
+ die "Unable to find links\n"
+ unless @links;
+
+ $dbh->begin;
+ foreach my $link (@links) {
+ $link = decode_entities($link);
+ $logger->info("Enqueueing $link for scan");
+ $dbh->insertOrIgnoreProxy( $link, $mech->uri->as_string );
+
+ }
+ $dbh->commit;
+}
+
+die "Unable to find pager\n"
+ unless $mech->content =~ m{<div id="pager">.+<a href="/(\d+)">\d+</a>\s*</div>}s;
+
+my $count = $1;
+
+die "Number of pages is too high ($count)\n"
+ if $count > 100;
+
+extractLinks;
+
+for ( my $i = 2 ; $i < $count ; $i++ ) {
+ sleep( int( rand(10) ) + 5 );
+ $logger->info("Checking page $i of $count");
+ $mech->get("http://prx.centrump2p.com/$i");
+ extractLinks;
+}
A => beaubot/bot/proxy/proxy-import-hinky.pl +57 -0
@@ 0,0 1,57 @@
+#!/usr/bin/perl -w
+
+use strict;
+use FindBin qw($RealBin);
+use lib "$RealBin/..";
+use Bot4;
+use ProxyDatabase;
+use WWW::Mechanize;
+use HTML::Entities qw(decode_entities);
+
+my $db = "$RealBin/../var/proxy.sqlite";
+my $bot = new Bot4;
+
+$bot->single(1);
+$bot->addOption( "database=s", \$db, "Changes path to a database" );
+$bot->setup( 'root' => "$RealBin/.." );
+
+my $logger = Log::Any::get_logger;
+$logger->info("Start");
+
+my $dbh = ProxyDatabase->new( 'file' => $db );
+my $mech = new WWW::Mechanize;
+$mech->agent_alias('Windows Mozilla');
+
+$mech->get('http://mrhinkydink.com/proxies.htm');
+
+sub extractLinks {
+ my @links = $mech->content =~ m{<td>(\d+\.\d+\.\d+\.\d+)(?:<sup>\*</sup>)?</td>\s*<td>(\d+)</td>}g;
+ die "Unable to find links\n"
+ unless @links;
+
+ $dbh->begin;
+ while (@links) {
+ my ( $address, $port ) = splice @links, 0, 2;
+ my $link = "$address:$port";
+ $logger->info("Enqueueing $link for scan");
+ $dbh->insertOrIgnoreProxy( $link, $mech->uri->as_string );
+ }
+ $dbh->commit;
+}
+
+die "Unable to find pager\n"
+ unless $mech->content =~ m{<a href="proxies(\d+)\.htm">\[\d+\]</a> \s*</td>}s;
+
+my $count = $1;
+
+die "Number of pages is too high ($count)\n"
+ if $count > 100;
+
+extractLinks;
+
+for ( my $i = 2 ; $i < $count ; $i++ ) {
+ sleep( int( rand(10) ) + 5 );
+ $logger->info("Checking page $i of $count");
+ $mech->get("http://mrhinkydink.com/proxies$i.htm");
+ extractLinks;
+}
A => beaubot/bot/proxy/proxy-import-procseebot.pl +88 -0
@@ 0,0 1,88 @@
+#!/usr/bin/perl -w
+
+use strict;
+use Data::Dumper;
+use DateTime;
+use DateTime::Format::Strptime;
+use FindBin qw($RealBin);
+use lib "$RealBin/..";
+use Bot4;
+use ProxyDatabase;
+
+my $db = "$RealBin/../var/proxy.sqlite";
+my $user = 'ProcseeBot';
+my $bot = new Bot4;
+
+$bot->single(1);
+$bot->addOption( "database=s", \$db, "Changes path to a database" );
+$bot->setProject( 'wikipedia', 'en' );
+$bot->setup( 'root' => "$RealBin/.." );
+
+my $dateFormat = new DateTime::Format::Strptime(
+ pattern => '%Y-%m-%dT%TZ',
+ time_zone => 'UTC',
+ on_error => 'croak',
+);
+
+my $logger = Log::Any::get_logger;
+$logger->info("Start");
+
+my $dbh = ProxyDatabase->new( 'file' => $db );
+
+my $lastTimestamp = $dbh->getSetting('procseebot.last.timestamp');
+
+my $api = $bot->getApi;
+my %args = ( #
+ 'action' => 'query',
+ 'list' => 'logevents',
+ 'leaction' => 'block/block',
+ 'leuser' => 'ProcseeBot',
+ 'ledir' => 'newer',
+ 'lelimit' => 'max',
+);
+$args{lestart} = $lastTimestamp
+ if defined $lastTimestamp;
+
+my $iterator = $api->getIterator(%args);
+my $count = 0;
+
+$dbh->begin;
+while ( my $entry = $iterator->next ) {
+ die "Invalid user '$entry->{user}'\n"
+ unless $entry->{user} eq $user;
+
+ die "Invalid title '$entry->{title}'\n"
+ unless $entry->{title} =~ m/^User:(\d+\.\d+\.\d+\.\d+)$/;
+
+ my $exitAddress = $1;
+
+ die "Invalid comment '$entry->{comment}'\n"
+ unless $entry->{comment} =~ m/^\{\{blocked proxy\}\} <!-- (?:(\S+):)?(\d+) -->$/;
+
+ my $entryAddress = defined $1 ? $1 : $exitAddress;
+ my $port = $2;
+
+ $lastTimestamp = $entry->{timestamp};
+
+ my $now = DateTime->now;
+ my $expiry = $dateFormat->parse_datetime( $entry->{params}->{expiry} );
+
+ if ( $expiry < $now ) {
+ $logger->info("Ignoring expired block $exitAddress ($entryAddress:$port)");
+ }
+ else {
+ $logger->info("Enqueueing $entryAddress:$port for scan");
+ $dbh->insertOrIgnoreProxy( "$entryAddress:$port", $user );
+ }
+
+ $lastTimestamp = $entry->{timestamp};
+
+ $count++;
+ if ( $count % 500 == 0 ) {
+ $dbh->setSetting( 'procseebot.last.timestamp', $lastTimestamp );
+ $dbh->commit;
+ $dbh->begin;
+ }
+}
+$dbh->setSetting( 'procseebot.last.timestamp', $lastTimestamp );
+$dbh->commit;
A => beaubot/bot/proxy/proxy-import-proxyorg.pl +46 -0
@@ 0,0 1,46 @@
+#!/usr/bin/perl -w
+
+use strict;
+use FindBin qw($RealBin);
+use lib "$RealBin/..";
+use Bot4;
+use ProxyDatabase;
+use WWW::Mechanize;
+use HTML::Entities qw(decode_entities);
+
+my $db = "$RealBin/../var/proxy.sqlite";
+my $bot = new Bot4;
+
+$bot->single(1);
+$bot->addOption( "database=s", \$db, "Changes path to a database" );
+$bot->setup( 'root' => "$RealBin/.." );
+
+my $logger = Log::Any::get_logger;
+$logger->info("Start");
+
+my $dbh = ProxyDatabase->new( 'file' => $db );
+my $mech = new WWW::Mechanize;
+$mech->agent_alias('Windows Mozilla');
+
+$mech->get('http://proxy.org/proxy.js');
+
+my $content = $mech->content;
+
+my @links = $content =~ /value="([^"' ]+)"/g;
+@links = grep { $_ ne 'random' } @links;
+die "Unable to find links\n"
+ unless @links;
+
+$dbh->begin;
+foreach my $link (@links) {
+ $link = decode_entities($link);
+ $link = "http://$link"
+ unless $link =~ '://';
+
+ next if $link eq 'http://';
+
+ $logger->info("Enqueueing $link for scan");
+ $dbh->insertOrIgnoreProxy( $link, $mech->uri->as_string );
+
+}
+$dbh->commit;
A => beaubot/bot/proxy/proxy-import-xeronet.pl +74 -0
@@ 0,0 1,74 @@
+#!/usr/bin/perl -w
+
+use strict;
+use FindBin qw($RealBin);
+use lib "$RealBin/..";
+use Bot4;
+use ProxyDatabase;
+use WWW::Mechanize;
+use HTML::Entities qw(decode_entities);
+
+my $db = "$RealBin/../var/proxy.sqlite";
+my $bot = new Bot4;
+
+$bot->single(1);
+$bot->addOption( "database=s", \$db, "Changes path to a database" );
+$bot->setup( 'root' => "$RealBin/.." );
+
+my $logger = Log::Any::get_logger;
+$logger->info("Start");
+
+my $dbh = ProxyDatabase->new( 'file' => $db );
+my $mech = new WWW::Mechanize;
+$mech->agent_alias('Windows Mozilla');
+
+#$mech->proxy( [ 'http', 'https' ] => '' );
+
+my %links;
+my @pages = ( #
+ 'http://www.xeronet-proxy-list.net/type/other_proxy',
+ 'http://www.xeronet-proxy-list.net/type/glype_proxy',
+ 'http://www.xeronet-proxy-list.net',
+ #'http://www.proxywebsitelist.in',
+ #'http://freeproxylistings.com',
+);
+
+foreach my $page (@pages) {
+ eval {
+ $logger->info("Checking $page");
+ $mech->get($page);
+
+ my $content = $mech->content;
+ my $count = 0;
+
+ foreach my $link ( $content =~ m{<A href="[^">]+out\.php?[^">]+"[^>]*>\s*(http://[^>]+?)\s*</A>}isg ) {
+ $links{$link}++;
+ $count++;
+ }
+ if ($count) {
+ $logger->info("Fetched $count links");
+ }
+ else {
+ $logger->error("No links found on $page");
+ }
+ };
+ if ($@) {
+ $@ =~ s/\s+$//;
+ $logger->error("Unable to fetch page $page: $@");
+ }
+
+}
+
+die "Unable to find links\n"
+ unless keys %links;
+
+$dbh->begin;
+foreach my $link ( keys %links ) {
+ $link = decode_entities($link);
+ $link = "http://$link"
+ unless $link =~ '://';
+
+ $logger->info("Enqueueing $link for scan");
+ $dbh->insertOrIgnoreProxy( $link, $mech->uri->as_string );
+}
+$dbh->commit;
A => beaubot/bot/proxy/schema.sql +45 -0
@@ 0,0 1,45 @@
+CREATE TABLE IF NOT EXISTS proxies (
+ proxy_id INTEGER PRIMARY KEY,
+ /* An address */
+ proxy_address TEXT UNIQUE,
+ /* A timestamp of a insert */
+ proxy_added TIMESTAMP,
+ /* A timestamp of the last check */
+ proxy_checked TIMESTAMP,
+ /* A reference to a source */
+ proxy_source TEXT
+);
+
+CREATE INDEX IF NOT EXISTS proxies_checked_idx ON proxies (proxy_checked);
+
+CREATE TABLE IF NOT EXISTS checks (
+ check_proxy INTEGER REFERENCES proxies (proxy_id),
+ /* A timestamp of the attempt */
+ check_timestamp TIMESTAMP,
+ /* A check result: 0 - not working, 1 - working */
+ check_status INTEGER,
+ /* -1 - Unknown, 0 - HTTP, 1 - SOCKS4, 2 - SOCKS5, 3 - Web Proxy */
+ check_type INTEGER,
+ /* An address of the proxy */
+ check_address TEXT,
+ PRIMARY KEY(check_proxy, check_timestamp)
+);
+
+CREATE TABLE IF NOT EXISTS sessions (
+ /* A session identifier of a check */
+ session_id TEXT PRIMARY KEY,
+ /* An address of the checked proxy */
+ session_address TEXT
+);
+
+CREATE TABLE IF NOT EXISTS blocks (
+ block_id INTEGER PRIMARY KEY,
+ block_address TEXT UNIQUE,
+ block_start TIMESTAMP,
+ block_expiry TIMESTAMP
+);
+
+CREATE TABLE IF NOT EXISTS settings (
+ name TEXT PRIMARY KEY,
+ value TEXT
+);