#!/usr/local/bin/perl
#################################################################
# Program: checkwiki.pl
# Description: Scan all pages of a Wikipedia-Project (dump or live) for errors
# Author: Stefan Kühn
# Version: 0.2
# Licence: GPL
#################################################################
#################################################################
# Syntax
# perl -w checkwiki.pl p=enwiki m=live
#################################################################
# New features, last changes and discussion
# http://de.wikipedia.org/wiki/Benutzer:Stefan_Kühn/Check_Wikipedia
#################################################################
# Error handling: fatal errors and warnings are passed to the script's own handlers
$SIG{__DIE__}  = \&die_error;
$SIG{__WARN__} = \&warn_error;
#################################################################
# Main program flow
#################################################################
our $test_programm = 'true';    # only for program tests

# Preparation: modules, directories, global state, command-line arguments
load_moduls();                  # standard perl moduls
declare_global_directorys();
declare_global_variables();
check_input_arguments();

# Each step runs only while no earlier step has asked to quit,
# therefore $quit_program is tested again in front of every step.
if ($quit_program eq 'no') {
    open_file();                # dump or live
}
if ($quit_program eq 'no') {
    text_translation_input();   # dump or live
}
if ($quit_program eq 'no') {
    scan_pages();               # scan all articles
}
close_file();                   # called whether or not the scan ran
if ($quit_program eq 'no') {
    text_translation_output();
}
if ($quit_program eq 'no') {
    output_errors();
}
if ($quit_program eq 'no') {
    output_statistic();
}
unless ($quit_reason eq '') {
    print $quit_reason;
}

# Database clean-up 1: remove all error descriptions with priority 0
my $sql_text2 = 'delete from cw_error_desc where prio = 0;';
$sth = $dbh->prepare($sql_text2);
$sth->execute();

# Database clean-up 2: remove the entries of this project that are marked ok
# and whose "found" value does not contain the first seven characters of the
# current time string; the statement is printed before it is executed
$sql_text2 = join('',
    "delete from cw_error where ok = 1 and project = '", $project,
    "' and found not like '%", substr(get_time_string(), 0, 7), "%';"
);
print $sql_text2."\n";
$sth = $dbh->prepare($sql_text2);
$sth->execute();

# Shut down: database connection first, then the logfile
$dbh->disconnect();
close(LOGFILE);
sub load_moduls{
#################################################################
# Load Module
#################################################################
# Groups the module loading of the script in one place.
# All "use" statements below are processed by perl at compile time, so the
# modules are loaded and their default exports are imported when the file is
# compiled, not when load_moduls() is called.
# "use strict" and "use warnings" are lexically scoped pragmas: they are in
# effect only inside the body of this sub.
#use lib "C:/perl/lib";
use strict;
use warnings;
use URI::Escape;
use LWP::UserAgent;
use CGI::Carp qw(fatalsToBrowser);
#use lib '/home/sk/perl/checkwiki';
# File name of the optional coordinates module (relative to the current directory)
our $file_module_coordinate = 'coordinates.pm';
# NOTE(review): the -e test below cannot make the loading optional, because
# "use coordinates" is executed at compile time regardless of the surrounding
# "if". If the module really is optional this would need a run-time
# "require" - confirm the intent before changing it.
if (-e $file_module_coordinate) {
use coordinates ;
}
# use new_coordinates;
#use lib '../module';
#use wikipedia;
#use URI::Escape;
#use LWP::UserAgent;
}
sub declare_global_directorys {
    # Sets the package globals that hold the base directories of the script.
    # Every value ends with a slash, so names can be appended directly.
    our (
        $dump_directory,
        $output_directory,
        $input_directory_new,
        $input_directory_change,
        $output_templatetiger,
        $output_geo,
    );

    # XML dumps (toolserver); alternative for home or usb: '../../dump/'
    $dump_directory         = '/mnt/user-store/dump/';

    # output of this script
    $output_directory       = '../../data/checkwiki/';

    # input lists: new articles and last changes
    $input_directory_new    = '../../data/new_article/';
    $input_directory_change = '../../data/last_changes/';

    # additional exports: templatetiger and coordinates
    $output_templatetiger   = '../../data/templatetiger/';
    $output_geo             = '../../data/geo/';
}
sub declare_global_variables {
    #################################################################
    # Declaration of the package globals shared by the whole script.
    # Scalars get their start value here; arrays are only declared.
    #################################################################

    # ---- program control -----------------------------------------
    our (
        $dump_or_live,      # scan modus (dump, live, only)
        $silent_modus,      # silent modus (very low output at screen) for batch
        $test_modus,        # test modus, switched on by the "test" argument
        $quit_reason,       # quit the program reason
    ) = ('') x 4;
    our $quit_program = 'no';                           # quit the program (yes,no)
    our ($time_start, $time_end) = (time(), time());    # start / end timer in seconds

    # ---- dump and project ----------------------------------------
    our (
        $date,              # date of dump "20060324"
        $line_number,       # number of line in dump
        $page_number,       # number of pages in namespace 0
    ) = (0) x 3;
    our (
        $project,           # name of the project 'dewiki'
        $language,          # language of dump 'de', 'en'
        $base,              # base of article, 'http://de.wikipedia.org/wiki/Hauptseite'
        $home,              # base of article, 'http://de.wikipedia.org/wiki/'
    ) = ('') x 4;

    # ---- namespaces ----------------------------------------------
    our @namespace;         # per namespace: 0 number,
                            #                1 namespace in project language,
                            #                2 namespace in english language
    our @namespacealiases;  # per alias: 0 number, 1 namespacealias
    our (
        $namespaces_count,          # number of namespaces
        $namespacealiases_count,    # number of namespacealiases
    ) = (-1) x 2;
    our (
        @namespace_cat,         # all namespaces for categorys
        @namespace_image,       # all namespaces for images
        @namespace_templates,   # all namespaces for templates
    );

    # ---- magic words ---------------------------------------------
    our (
        @magicword_defaultsort,
        @magicword_img_thumbnail,
        @magicword_img_manualthumb,
        @magicword_img_right,
        @magicword_img_left,
        @magicword_img_none,
        @magicword_img_center,
        @magicword_img_framed,
        @magicword_img_frameless,
        @magicword_img_page,
        @magicword_img_upright,
        @magicword_img_border,
        @magicword_img_sub,
        @magicword_img_super,
        @magicword_img_link,
        @magicword_img_alt,
        @magicword_img_width,
        @magicword_img_baseline,
        @magicword_img_top,
        @magicword_img_text_top,
        @magicword_img_middle,
        @magicword_img_bottom,
        @magicword_img_text_bottom,
    );

    # ---- Wiki-special variables (live scan) ----------------------
    our (
        @live_article,          # to-do-list for live (all articles to scan)
        @live_to_scan,          # article of one error number which should be scanned
        @article_was_scanned,   # if an article was scanned, it is inserted here
        @page_with_error,
    );
    our (
        $current_live_article,          # line_number_of_current_live_article
        $number_of_live_tests,          # number of articles for live test
        $current_live_error_scan,       # for scan every 100 article of an error
        $number_article_live_to_scan,   # all article from one error
        $error_counter,                 # number of found errors in all article
    ) = (-1) x 5;
    our $xml_text_from_api = '';        # the text from more than one article from the API

    # ---- error descriptions --------------------------------------
    # Columns of one row in @error_description / @error_geo_description:
    #   0 priority
    #   1 title in English
    #   2 description in English
    #   3 number of found (only live scanned)
    #   4 priority of foreign language
    #   5 title in foreign language
    #   6 description in foreign language
    #   7 number of found in last scan (from statistic file)
    #   8 all known errors (from statistic file + live)
    my @start_row = (-1, '', '', 0, -1, '', '', 0, 0);

    our @error_description;             # Error Description
    our $number_of_max_errors = 100;    # number of max error_description
    foreach my $error_number (0 .. $number_of_max_errors) {
        @{ $error_description[$error_number] }[0 .. 8] = @start_row;
    }

    our $max_error_count             = 50;  # maximum of shown article per error
    our $maximum_current_error_scan  = -1;  # how much should be scanned to reach the max_error_count
    our $rest_of_errors_not_scan_yet = '';
    our (
        $number_of_all_errors_in_all_articles,      # all errors
        $for_statistic_new_article,
        $for_statistic_last_change_article,
        $for_statistic_geo_article,
        $for_statistic_number_of_articles_with_error,
    ) = (0) x 5;

    our $error_geo_counter = -1;        # number of found errors in all article
    our (@page_with_geo_error, @error_geo_description);
    our $number_of_max_geo_errors = 100;
    foreach my $geo_error_number (0 .. $number_of_max_geo_errors) {
        @{ $error_geo_description[$geo_error_number] }[0 .. 8] = @start_row;
    }

    # ---- file names ----------------------------------------------
    our $live_filename                     = 'input_for_live.txt';
    our $output_live_wiki                  = 'output_for_wikipedia.txt';
    our $output_dump_wiki                  = 'output_for_wikipedia_dump.txt';
    our $error_list_filename               = 'error_list.txt';
    our $error_list_filename_only          = 'error_list_only.txt';
    our $error_list_filename_dump          = 'error_list_dump.txt';    # all errors from the last dump scan
    our $error_list_filename_backup        = 'error_list_dump_backup.txt';
    our $error_statistic_filename          = 'error_statistic.txt';
    our $error_statistic_filename_only     = 'error_statistic_only.txt';
    our $error_statistic_filename_list     = 'error_statistic_list.txt';
    our $translation_file                  = 'translation.txt';
    our $error_list_filename_30            = 'error_list_error_030.txt';
    our $error_list_filename_every         = 'error_list_error';       # for all errors
    our $error_geo_list_filename           = 'error_geo_list.txt';
    our $error_geo_list_filename_only      = 'error_geo_list_only.txt';
    our $error_geo_list_filename_html      = 'error_geo_list.htm';
    our $error_geo_list_filename_only_html = 'error_geo_list_only.htm';
    our $log_file                          = 'log.txt';

    # ---- interwiki language codes --------------------------------
    our @inter_list = qw(
        af als an ar
        bg bs
        ca cs cy
        da de
        el en eo es et eu
        fa fi fr fy
        gl gv
        he hi hr hu
        id is it
        ja jv
        ka ko
        la lb lt
        ms
        nds nds_nl nl nn no
        pl pt
        ro ru
        sh simple sk sl sr sv sw
        ta th tr
        uk ur
        vi vo
        yi
        zh
    );

    # ---- foundation projects and their prefixes ------------------
    our @foundation_projects = qw(
        wikibooks b
        wiktionary wikt
        wikinews n
        wikiquote q
        wikisource s
        wikipedia w
        wikispecies species
        wikimedia foundation wmf
        wikiversity v
        commons
        meta metawikipedia m
        incubator
        mw
        quality
        bugzilla mediazilla
        nost
        testwiki
    );

    # current time
    get_time();

    # ---- texts for the output page -------------------------------
    our $translation_page = '';     # name of the page with translation, for example in de: "Wikipedia:WikiProject Check Wikipedia/Übersetzung"

    our $start_text = join('',
        "The WikiProject '''Check Wikipedia''' will help to clean up the syntax of Wikipedia and to find some other errors.\n",
        "\n",
        "'''Betatest''' - At the moment the script has some bugs and not every error on this page is an actual error. \n",
        "\n",
    );

    our $description_text = join('',
        "== Project description in English == \n",
        "* '''What is the goal of this project?'''\n",
        "** This project should help to clean up the data of all articles in many different languages.\n",
        "** If we have a clear and clean syntax in all articles more projects (for example: Wikipedia-DVD) can use our data more easily.\n",
        "** The project was inspired by [[:en:Wikipedia:WikiProject Wiki Syntax]].\n",
        "** In order to use the data of a Wikipedia project without the Mediawiki software you need to write a parser. If many articles include wrong syntax it is difficult to program the parser since it needs to be complex enough to recognize the syntax errors.\n",
        "** This project helps to find many errors in all kinds of language and will support many languages in the future. \n",
        "\n",
        "* '''How does it work?'''\n",
        "** The script scans every new [http://dumps.wikimedia.org dump] and creates a list of articles with errors.\n",
        "** The script scans all articles on the list on a daily basis to create a new list for users, omitting already-corrected articles.\n",
        "** The script is written in Perl by: [[:de:User:Stefan Kühn|Stefan Kühn]] ", "\n",
        "** You can download the script [http://toolserver.org/~sk/checkwiki/checkwiki.pl here]. It is licensed under GPL.", "\n",
        "** [[:de:User:Stefan Kühn/Check Wikipedia|New features, last changes and discussion]]. ", "\n",
        "\n",
        "* '''What can you do?'''\n",
        "** The script creates a new error page at the toolserver every day. Please copy and paste the daily updated page at the toolserver (See downloads) to this page here. Attention: That page is a UTF-8 document. In case your browser cannot display the file in UTF-8 you can copy it into a text editor (for example: Notepad++) and convert it to UTF-8. \n",
        "** You can fix an error in one or more articles. \n",
        "** You can delete all fixed articles from this list. \n",
        "** If all articles in one category have been fixed you can delete this category. \n",
        "** You can suggest a new category of errors to the author of the script. \n",
        "** You can also inform the author if you want this project to be implemented into your language's Wikipedia. \n",
        "\n",
        "* '''Please don't… '''\n",
        "** insert an article by hand since it will disappear from the list with the next automatic update of this page. \n",
        "** try to fix spelling mistakes within this page since all manual changes will disappear as well with the next update. Instead, send an e-mail or message to the author so he can fix the spelling in the script. \n",
        "\n",
    );

    # ---- headlines of the priority sections ----------------------
    # "_script" holds the English headline; "_project" starts empty
    our $category_text = '';
    our ($top_priority_script, $middle_priority_script, $lowest_priority_script)
        = ('Top priority', 'Middle priority', 'Lowest priority');
    our ($top_priority_project, $middle_priority_project) = ('') x 2;
    our $lowest_priority_project = '';
}
sub get_time{
    # Stores the parts of the current local time in package globals
    # ($akSekunden ... $akSommerzeit) and the complete time string in
    # $CTIME_String. Month and year are converted to calendar values;
    # month, day of month, hour and minute are padded to two digits
    # (the seconds are left as they are).
    our ($akSekunden, $akMinuten, $akStunden, $akMonatstag, $akMonat,
         $akJahr, $akWochentag, $akJahrestag, $akSommerzeit) = localtime(time);
    our $CTIME_String = localtime(time);

    $akJahr  += 1900;   # localtime counts years from 1900
    $akMonat += 1;      # localtime counts months from 0

    # the loop variable is an alias, so the globals themselves are changed
    foreach my $time_part ($akMonat, $akMonatstag, $akStunden, $akMinuten) {
        $time_part = "0".$time_part if ($time_part < 10);
    }
}
sub get_time_string{
    # Returns the current local time as one string of the form
    # "YYYYMMDD_hhmmss" (month, day, hour, minute and second with two digits).
    my ($second, $minute, $hour, $day_of_month, $month_index, $years_since_1900)
        = (localtime(time))[0 .. 5];
    return sprintf('%d%02d%02d_%02d%02d%02d',
        $years_since_1900 + 1900,
        $month_index + 1,
        $day_of_month,
        $hour,
        $minute,
        $second,
    );
}
sub check_input_arguments{
#################################################################
# Declaration of parameters (extern)
#################################################################
# Evaluates @ARGV and afterwards prepares the database:
#   p=<project>         sets $project; $language is the project name with
#                       the first "wiki" removed
#   m=dump|live|only    sets $dump_or_live
#   silent / test       set $silent_modus / $test_modus
# If an argument is missing, $quit_program becomes 'yes' and $quit_reason
# receives the problem plus usage examples. Otherwise a start banner is
# printed, "_test" is appended to $project in test modus and frwiki gets a
# max_error_count of 100.
# The database part at the end is outside of that if/else, so it runs in both
# cases: it reads the password, connects ($dbh) and inserts an empty
# cw_error_desc row for every id from 0 to 150 that has no row for this project.
if ( @ARGV < 1) {
# no parameters
$quit_reason = $quit_reason. 'no parameters'."\n\n";
$quit_program = 'yes';
}
###################
#check argument value for project
my $found_argv = 'no';
foreach (@ARGV) {
my $current_argv = $_;
# only arguments that start with "p=" are taken as project name
if ( index($current_argv, 'p=') == 0) {
$found_argv = 'yes';
$project = $current_argv;
$project =~ s/^p=//;
$language = $project;
$language =~ s/wiki//;
}
}
if ($found_argv eq 'no'){
# no project name
$quit_reason = $quit_reason. 'no project name, for example: "p=dewiki"'."\n\n";
$quit_program = 'yes';
}
####################
#check argument value for scanmodus
$found_argv = 'no';
foreach (@ARGV) {
my $current_argv = $_;
if ( $current_argv eq 'm=dump'
or $current_argv eq 'm=live'
or $current_argv eq 'm=only' )
{
$found_argv = 'yes';
$dump_or_live = $current_argv;
$dump_or_live =~ s/^m=//;
}
}
if ($found_argv eq 'no'){
#no scan modus
$quit_reason = $quit_reason. 'modus unknown, for example: "m=dump/live/only"'."\n\n";
$quit_program = 'yes';
}
####################
#check argument value for silent or test
# both flags are optional, so $found_argv is reset here but not evaluated afterwards
$found_argv = 'no';
foreach (@ARGV) {
my $current_argv = $_;
$silent_modus = 'silent' if ( $current_argv eq 'silent' );
$test_modus = 'test' if ( $current_argv eq 'test');
}
if ($quit_program eq 'yes'){
#End of Script, because no correct parameter
$quit_reason = $quit_reason.'Use for scan a dump'."\n";
$quit_reason = $quit_reason.'perl -w checkwiki.pl p=dewiki m=dump'."\n";
$quit_reason = $quit_reason.'perl -w checkwiki.pl p=nds_nlwiki m=dump'."\n\n";
$quit_reason = $quit_reason.'perl -w checkwiki.pl p=nds_nlwiki m=dump silent'."\n\n";
$quit_reason = $quit_reason.'perl -w checkwiki.pl p=nds_nlwiki m=dump silent test'."\n\n";
$quit_reason = $quit_reason.'Use for scan a list of pages live'."\n";
$quit_reason = $quit_reason.'perl -w checkwiki.pl p=dewiki m=live'."\n";
$quit_reason = $quit_reason.'perl -w checkwiki.pl p=dewiki m=live silent'."\n";
$quit_reason = $quit_reason.'perl -w checkwiki.pl p=dewiki m=live silent test'."\n";
$quit_reason = $quit_reason."\n";
} else {
# All parameters available and correct
# extract parameters
print "\n";
if ($silent_modus ne 'silent') {
print '##################################################'."\n";
print '######## checkwiki.pl - Version 0.2 ########'."\n";
}
print '##################################################'."\n";
print 'Start: '."\t\t".$akJahr.'-'.$akMonat.'-'.$akMonatstag.' '.$akStunden.':'.$akMinuten."\n";
print 'Project:'."\t\t". $project."\n";
if ($silent_modus ne 'silent') {
print 'Modus: '."\t\t". $dump_or_live. ' (';
print 'scan a dump' if ($dump_or_live eq 'dump');
print 'scan live' if ($dump_or_live eq 'live');
print 'scan a dump only some errors' if ($dump_or_live eq 'only');
print ')'."\n";
}
# in test modus all further work uses the project name with the suffix "_test"
$project = $project.'_test' if ($test_modus eq 'test');
print "\t\t\t".'Test-Modus --> '.$project.'!!!'."\n" if ($test_modus eq 'test');
if ( $project eq 'frwiki') {
$max_error_count = 100;
print 'SPECIAL: max_error_count = 100'."\n";
}
}
#################################################################
# DB
#################################################################
use DBI;
#load password
# NOTE(review): the following lines look truncated in this copy of the file:
# the open() call has lost its file name and the loop that reads the handle
# has no visible beginning. Presumably a "pass=" line is read from a password
# file - restore this part from the original script before running it.
open(PWD, ";
if ($test =~ /^pass=/ ) {
$password = $test;
$password =~ s/^pass=//g;
$password =~ s/\n//g;
}
}
while (eof(PWD) != 1);
close(PWD);
#print "-".$password."-\n";
#Connect to database u_sk
our $dbh = DBI->connect( 'DBI:mysql:u_sk_yarrow:host=sql',
'sk',
$password ,
{
RaiseError => 1,
AutoCommit => 1
}
) or die "Database connection not made: $DBI::errstr" . DBI->errstr;
print "Content-type: text/html\n\n";
$password = '';
# check for description in database
for(my $i = 0; $i <= 150; $i ++) {
# show for the description of one error
my $sql_text2 = "select count(*) from cw_error_desc where id = ".$i." and project = '".$project."';";
#print $sql_text2."\n";
$sth = $dbh->prepare( $sql_text2 );
$sth->execute;
my $result = 0;
while (my $arrayref = $sth->fetchrow_arrayref()) {
foreach(@$arrayref) {
$result = $_;
}
}
if ($result == 0) {
# insert line for a not existing error ( later in the script this line will be deleted, if no name is inserted )
#print 'insert '.$i."\n";
my $sql_text2 = "insert into cw_error_desc (project, id, prio, name, text) values ( '".$project."', " .$i. ", 0, '' ,'');";
#print $sql_text2."\n";
$sth = $dbh->prepare( $sql_text2 );
$sth->execute;
}
}
}
sub open_file{
    # Prepares all input and output the scan needs, depending on $dump_or_live:
    #   always        create the output directory of the project, open LOGFILE
    #   'dump'        find the last dump, remember its name in
    #                 <project>_last_dump_name.txt and store the dump date
    #                 in the database (cw_project.last_dump)
    #   'dump'/'only' open the dump (DUMP), read its metadata and reset the
    #                 templatetiger file (TEMPLATETIGER) and the geo export file
    #   'live'        collect the articles to scan in @live_article, sorted
    #                 and with exactly one entry per title
    # At the end the metadata is loaded, unless a step has asked to quit.
    # Problems that stop the scan are reported in $quit_program / $quit_reason.

    # create subdirectory
    if (not (-e $output_directory.$project )) {
        print 'create directory:'."\t". $output_directory.$project."\n";
        mkdir($output_directory.$project ,0777);
    }

    ################################
    # open logfile (read and write, emptied on every run)
    my $log_filename = $output_directory.$project.'/'.$project.'_'.$log_file;
    open (LOGFILE, '+>', $log_filename)
        or print 'cannot open logfile: '.$log_filename.' ('.$!.')'."\n";

    ################################
    # if new dump is available
    if ($dump_or_live eq 'dump') {
        $dump_filename = search_for_last_dump();
        print 'Dump_filename:'."\t\t".$dump_filename."\n" if ($silent_modus ne 'silent');
        my $last_dump_filename = $output_directory.$project.'/'.$project.'_last_dump_name.txt';

        if (not (-e $last_dump_filename)) {
            # create the file if not exist
            print 'create last_dump_file:'."\t".$project.'_last_dump_name.txt'."\n";
            if (open (my $last_dump_first, '>', $last_dump_filename)) {
                print {$last_dump_first} 'x';
                close($last_dump_first);
            }
        }

        # read the last name (first line of the file; stays '' if the file
        # is empty or cannot be opened)
        my $last_dump_name_old = '';
        if (open (my $last_dump_in, '<', $last_dump_filename)) {
            my $first_line = <$last_dump_in>;
            $last_dump_name_old = $first_line if (defined $first_line);
            close($last_dump_in);
        }
        $last_dump_name_old =~ s/\n//g;

        # get date from dumpfile: keep the digits of the file name and
        # format the first eight of them as YYYY-MM-DD
        our $dump_date_for_output = $dump_filename;
        $dump_date_for_output =~ s/^[^\-]-//g;
        $dump_date_for_output =~ s/^[^0-9]+//g;
        $dump_date_for_output =~ s/[^0-9]+$//g;
        $dump_date_for_output = substr($dump_date_for_output,0,4).'-'.substr($dump_date_for_output,4,2).'-'.substr($dump_date_for_output,6,2);

        if ($dump_filename ne $last_dump_name_old ) {
            # the dump differs from the remembered one: remember the new name
            print 'Last: '."\t\t". $last_dump_name_old."\n";
            print 'Current: '."\t\t". $dump_filename."\n";
            if (open (my $last_dump_out, '>', $last_dump_filename)) {
                print {$last_dump_out} $dump_filename;
                close($last_dump_out);
            }
        }

        # store the dump date in the database; the values are bound, so
        # special characters in them cannot break the statement
        my $sth = $dbh->prepare( 'update cw_project set last_dump = ? where project = ?;' );
        $sth->execute($dump_date_for_output, $project);
    }

    ################################
    if ($dump_or_live eq 'dump' or $dump_or_live eq 'only') {
        # check for existens dump; the same path is used for the test and the open
        my $dump_path = "$dump_directory$dump_filename";
        if ($dump_filename ne '' and -e $dump_path) {
            if (open(DUMP, '<', $dump_path)) {
                read_and_write_metadata_from_dump();
            } else {
                $quit_program = 'yes';
                $quit_reason = $quit_reason. "file '$dump_path' can't be opened: $!\n";
            }
        } else {
            $quit_program = 'yes';
            $quit_reason = $quit_reason. "file '$dump_directory$dump_filename'". " don't exist!\n";
        }

        # Templatetiger: start with an empty file
        our $templatetiger_filename = $output_templatetiger.$project.'/'.$project.'_templatetiger.txt';
        if (not (-e $output_templatetiger.$project )) {
            print 'create new subdirectory'."\t".'templatetiger'."\n";
            mkdir($output_templatetiger.$project ,0777);
        }
        if (-e $templatetiger_filename ) {
            unlink($templatetiger_filename)
                or print 'cannot delete '.$templatetiger_filename.' ('.$!.')'."\n";
        }
        open (TEMPLATETIGER, '>>', $templatetiger_filename)
            or print 'cannot open '.$templatetiger_filename.' ('.$!.')'."\n";

        #GEO Export: delete the old export file
        our $geo_export_filename = $output_geo.$project.'/'.$project.'_coordinates.txt';
        if (not (-e $output_geo.$project )) {
            print 'create new subdirectory'."\t".'geo'."\n";
            mkdir($output_geo.$project ,0777);
        }
        if (-e $geo_export_filename ) {
            print 'Delete '.$geo_export_filename."\n";
            unlink($geo_export_filename)
                or print 'cannot delete '.$geo_export_filename.' ('.$!.')'."\n";
        }
    }

    if ($dump_or_live eq 'live' ) {
        # open list for live
        my $live_list_filename = $output_directory.$project.'/'.$project.'_'.$error_list_filename;
        if (not (-e $live_list_filename )){
            $quit_program = 'yes';
            $quit_reason = $quit_reason. "file:" .$live_list_filename. " don't exist!\n";
        } else {
            #read articles(live)
            article_last_scan();                    # get all article from last scan, where the script found errors
            new_article();                          # get all new article last days
            last_change_article();                  # get all changed article last days
            geo_error_article();                    # get all articles with geo errors last days
            article_with_error_from_dump_scan();    # get all articles error from the last dump scan
            get_done_article_from_database();       # get all article which is done in the database
            get_oldest_article_from_database();

            # sort all articles (new + live), so equal titles follow each other
            @live_article = sort(@live_article);
            $number_of_live_tests = @live_article;

            # delete all double/multi input article: every entry has the form
            # "title<TAB>errors"; the errors of equal titles are joined with ', '
            my @new_live_article;
            my @split_line;
            my $old_title = '';
            my $all_errors_of_this_article = '';
            foreach (@live_article) {
                @split_line = split(/\t/, $_);
                my $current_title = $split_line[0];
                # NOTE(review): this substitution runs before the field count
                # is checked; on an entry without a second field it presumably
                # creates $split_line[1], so such an entry may pass the check
                # below - confirm before relying on the check.
                $split_line[1] =~ s/\n//;
                my $number_of_split_line = @split_line;
                if ($number_of_split_line != 2) {
                    print 'Problem with input line:'."\n";
                    print $_."\n";
                    die;
                }
                if ($old_title ne $current_title
                    and $old_title ne ''){
                    #save old
                    push(@new_live_article, $old_title."\t".$all_errors_of_this_article);
                    $all_errors_of_this_article = '';
                }
                # check new
                if ($old_title eq $current_title) {
                    #double
                    $all_errors_of_this_article = $all_errors_of_this_article.', '.$split_line[1];
                } else {
                    $all_errors_of_this_article = $split_line[1];
                }
                $old_title = $current_title;
            }
            #save last (nothing to save if there was no entry with a title)
            if ($old_title ne '') {
                push(@new_live_article, $old_title."\t".$all_errors_of_this_article);
            }

            @live_article = @new_live_article;
            $number_of_live_tests = @live_article;
            print 'articles without double'."\t".$number_of_live_tests."\n";
            print LOGFILE 'articles without double'."\t".$number_of_live_tests."\n";
            @new_live_article = (); # free memory
            @split_line = (); # free memory
        }
    }

    # load the metadata if nothing went wrong so far
    if ($quit_program eq 'no' ) {
        read_and_write_metadata_from_dump();
        load_metadata_from_file();
    }
}
sub article_last_scan{
    # Replaces @live_article with the lines of the error list of the last
    # scan (<output_directory><project>/<project>_<error_list_filename>).
    # The lines are taken as they are, including their line ends.
    # Sets $number_of_live_tests and reports the number on screen and in LOGFILE.
    my $file_input_live = $output_directory.$project.'/'.$project.'_'.$error_list_filename;
    @live_article = ();
    if (open(my $live_input, '<', $file_input_live)) {
        @live_article = <$live_input>;
        close ($live_input);
    } else {
        # the list stays empty; say why instead of reading from a closed handle
        print 'cannot open file: '.$file_input_live.' ('.$!.')'."\n";
    }
    $number_of_live_tests = @live_article;
    print 'articles last scan:'."\t".$number_of_live_tests."\n";
    print LOGFILE 'articles last scan:'."\t".$number_of_live_tests."\n";
}
sub new_article{
    # Load new articles: reads <input_directory_new><project>/<project>_new_article.txt,
    # takes the second tab-separated field of every line as article title
    # and appends "title<TAB>0" to @live_article.
    # Lines without a second field are skipped. A missing file is not an
    # error; it is only mentioned in the screen output.
    # The number of loaded articles is printed, logged and stored in
    # $for_statistic_new_article.
    my $file_new = $project.'_new_article.txt';
    my $file_input_new = $input_directory_new.$project.'/'.$file_new;
    my $new_counter = 0;
    my $open_error = '';
    if (-e $file_input_new) {
        #if existing
        if (open(my $input_new, '<', $file_input_new)) {
            while (defined(my $line = <$input_new>)) {
                $line =~ s/\n$//g;
                my @split_line = split ( /\t/, $line);
                # no title in this line (empty line or no tab)
                next if (not defined $split_line[1] or $split_line[1] eq '');
                push(@live_article, $split_line[1]."\t".'0' );
                $new_counter ++;
            }
            close ($input_new);
        } else {
            $open_error = "$!";
        }
    }
    print 'articles new:'."\t\t".$new_counter;
    print ' (no file: '.$file_new.' )' if not (-e $file_input_new);
    print ' (cannot open file: '.$file_new.' - '.$open_error.' )' if ($open_error ne '');
    print "\n";
    print LOGFILE 'articles new:'."\t\t".$new_counter. "\n";
    $for_statistic_new_article = $new_counter;
}
sub last_change_article{
    # Load last change articles: reads
    # <input_directory_change><project>/<project>_last_changes.txt, takes the
    # second tab-separated field of every line as article title and appends
    # "title<TAB>0" to @live_article.
    # Lines without a second field are skipped. A missing file is not an
    # error; it is only mentioned in the screen output.
    # The number of loaded articles is printed, logged and stored in
    # $for_statistic_last_change_article.
    my $file_last_change = $project.'_last_changes.txt';
    my $file_input_last_change = $input_directory_change.$project.'/'.$file_last_change;
    my $change_counter = 0;
    my $open_error = '';
    if (-e $file_input_last_change) {
        #if existing
        if (open(my $input_change, '<', $file_input_last_change)) {
            while (defined(my $line = <$input_change>)) {
                $line =~ s/\n$//g;
                my @split_line = split ( /\t/, $line);
                # no title in this line (empty line or no tab)
                next if (not defined $split_line[1] or $split_line[1] eq '');
                push(@live_article, $split_line[1]."\t".'0' );
                $change_counter ++;
            }
            close ($input_change);
        } else {
            $open_error = "$!";
        }
    }
    print 'articles change:'."\t".$change_counter;
    print ' (no file: '.$file_last_change.' )' if not (-e $file_input_last_change);
    print ' (cannot open file: '.$file_last_change.' - '.$open_error.' )' if ($open_error ne '');
    print "\n";
    print LOGFILE 'articles change:'."\t".$change_counter."\n";
    our $for_statistic_last_change_article = $change_counter;
}
sub geo_error_article{
    # Load the articles with geo errors: reads
    # <output_geo><project>/<project>_<error_geo_list_filename>, takes the
    # first tab-separated field of every line as article title and appends
    # "title<TAB>0" to @live_article.
    # Lines without a title are skipped. A missing file is not an error; it
    # is only mentioned in the screen output.
    # The number of loaded articles is printed, logged and stored in
    # $for_statistic_geo_article.
    my $file_geo = $project.'_'.$error_geo_list_filename;
    my $file_input_geo = $output_geo.$project.'/'.$file_geo;
    my $geo_counter = 0;
    my $open_error = '';
    if (-e $file_input_geo) {
        #if existing
        if (open(my $input_geo, '<', $file_input_geo)) {
            while (defined(my $line = <$input_geo>)) {
                $line =~ s/\n$//g;
                my @split_line = split ( /\t/, $line);
                # empty line or empty first field: no title
                next if (@split_line == 0 or $split_line[0] eq '');
                push(@live_article, $split_line[0]."\t".'0' );
                $geo_counter ++;
            }
            close ($input_geo);
        } else {
            $open_error = "$!";
        }
    }
    print 'articles geo:'."\t\t".$geo_counter;
    print ' (no file: '.$file_geo.' )' if not (-e $file_input_geo);
    print ' (cannot open file: '.$file_geo.' - '.$open_error.' )' if ($open_error ne '');
    print "\n";
    print LOGFILE 'articles geo:'."\t\t".$geo_counter."\n";
    $for_statistic_geo_article = $geo_counter;
}
sub article_with_error_from_dump_scan{
    # Only in live modus: takes over the errors found by the last dump scan.
    # Reads <output_directory><project>/<project>_<error_list_filename_dump>
    # ("title<TAB>errors" per line), appends every non-empty line to
    # @live_article and deletes the file afterwards, so it is used only once.
    # The number of loaded articles is printed and logged.
    return if ($dump_or_live ne 'live');

    my $input_dump_errors = $output_directory.$project.'/'.$project.'_'.$error_list_filename_dump;
    my $dump_counter = 0;
    if (-e $input_dump_errors) {
        #if existing
        if (open(my $input_dump, '<', $input_dump_errors)) {
            while (defined(my $line = <$input_dump>)) {
                $line =~ s/\n$//g;
                my @split_line = split ( /\t/, $line);
                next if (@split_line == 0);     # empty line
                # a line without error part is kept with an empty error list
                my $errors_of_article = defined $split_line[1] ? $split_line[1] : '';
                push(@live_article, $split_line[0]."\t".$errors_of_article );
                $dump_counter ++;
            }
            close ($input_dump);
        } else {
            print 'cannot open file: '.$input_dump_errors.' ('.$!.')'."\n";
        }
        # delete (without a shell, so special characters in the name are harmless)
        unlink($input_dump_errors)
            or print 'cannot delete file: '.$input_dump_errors.' ('.$!.')'."\n";
    }
    print 'articles dump:'."\t\t".$dump_counter."\n";
    print LOGFILE 'articles dump:'."\t\t".$dump_counter."\n";
}
sub get_done_article_from_database{
    # Appends the titles of up to 2500 cw_error rows of this project that are
    # marked ok (ok = 1) to @live_article, each as "title<TAB>0".
    # The number of titles is printed and logged.
    my $database_ok_counter = 0;
    # the project name is bound as value, so it needs no quoting
    my $sql_text = " select title from cw_error where ok = 1 and project = ? limit 2500;";
    my $sth = $dbh->prepare( $sql_text );
    #print ''.$sql_text."\n";
    $sth->execute($project);
    # every row has exactly one column: the title
    while (my ($title_from_db) = $sth->fetchrow_array()) {
        push(@live_article, $title_from_db."\t".'0' );
        $database_ok_counter ++;
    }
    print 'done articles from db:'."\t\t".$database_ok_counter."\n";
    print LOGFILE 'done articles from db:'."\t\t".$database_ok_counter."\n";
}
sub get_oldest_article_from_database{
    # Appends the titles of the 500 cw_error rows of this project with the
    # smallest "found" value to @live_article, each as "title<TAB>0".
    # The number of titles is printed and logged.
    my $database_ok_counter = 0;
    # the project name is bound as value, so it needs no quoting
    my $sql_text = " select title from cw_error where project = ? order by found asc limit 500;";
    my $sth = $dbh->prepare( $sql_text );
    #print ''.$sql_text."\n";
    $sth->execute($project);
    # every row has exactly one column: the title
    while (my ($title_from_db) = $sth->fetchrow_array()) {
        push(@live_article, $title_from_db."\t".'0' );
        $database_ok_counter ++;
    }
    print 'old articles from db:'."\t\t".$database_ok_counter."\n";
    print LOGFILE 'old articles from db:'."\t\t".$database_ok_counter."\n";
}
sub search_for_last_dump {
    # Looks through all *.xml files of $dump_directory and returns the name
    # (without directory) of the last file in glob order that belongs to the
    # project and is not empty. A file belongs to the project if its name
    # starts with "<project>-"; a "_test" suffix of the project is ignored.
    # Returns '' if nothing fits. In that case, unless the program scans
    # live, $quit_program is set to 'yes' and $quit_reason gets the details.
    my @xml_paths = glob($dump_directory.'*.xml');
    my $count_xml_files = scalar(@xml_paths);

    # project name without the test suffix
    (my $project_test = $project) =~ s/_test$//;

    my $last_file = '';
    foreach my $xml_path (@xml_paths) {
        my $byte = -s $xml_path;

        # file name without directory
        (my $xml_name = $xml_path) =~ s/(.)+\///g;

        my $is_from_project = (   index($xml_name, $project.'-') == 0
                               or index($xml_name, $project_test.'-') == 0 );

        # only files of this project with more than 0 bytes; a later hit
        # replaces an earlier one
        if ($is_from_project and $byte > 0) {
            $last_file = $xml_name;
        }
    }

    # stop if dump scan, run if the program will scan live
    if ($last_file eq '' and $dump_or_live ne 'live') {
        $quit_program = 'yes';
        $quit_reason = $quit_reason.$count_xml_files.' XML-files found in folder '.$dump_directory."\n";
        $quit_reason = $quit_reason.'Found no XML-file for project: '.$project."\n";
    }

    @xml_paths = (); # free memory
    return($last_file);
}
############################################################################
sub scan_pages{
    # Main loop of the scan: resets the article variables, fetches the next
    # page (from the dump in modus dump/only, otherwise live) and checks it.
    # Pages whose title ends with ".js" or ".css" are not checked.
    # The loop ends when the page source reports its end ($end_of_dump or
    # $end_of_live is 'yes') or, in live modus, when more than 40000 errors
    # were found.
    print 'Start scanning'."\n" if ($silent_modus ne 'silent');
    our $end_of_dump = 'no';    # when last article from dump scan then 'yes', else 'no'
    our $end_of_live = 'no';    # when last article from live scan then 'yes', else 'no'

    while (1) {
        # get the text of the next page
        set_variables_for_article();
        if ($dump_or_live eq 'dump' or $dump_or_live eq 'only') {
            get_next_page_from_dump();
        }
        else {
            get_next_page_from_live();
        }

        if ($end_of_dump eq 'yes' or $end_of_live eq 'yes') {
            # no more pages
            print 'articles scan finish'."\n\n" if ($silent_modus ne 'silent');
        }
        elsif (    $end_of_dump eq 'no'
               and $end_of_live eq 'no'
               and not (   $title =~ /\.js$/
                        or $title =~ /\.css$/ ) ) {
            check_article();    # Main check routine
        }
        else {
            print 'no check in article:'."\t\t".$title."\n";
        }

        # end conditions, tested after every page
        last if ($end_of_dump eq 'yes' or $end_of_live eq 'yes');
        last if ($error_counter > 40000 and $dump_or_live eq 'live');
    }
}
sub set_variables_for_article {
    # Prepare the package variables that describe "the current article"
    # before the next page is fetched, and count the page.
    #
    # NOTE(review): the scalar variables are reset to their start values
    # here, but of the arrays only @section is emptied; all other arrays
    # are merely declared and keep whatever they contained before.
    # Confirm that they are cleared elsewhere.
    $page_number = $page_number + 1;

    # --- identity of the page -------------------------------------------
    our ($title, $page_id, $revision_id, $revision_time) = ('', -1, -1, -1);
    our $page_namespace = -100;             # namespace of page

    # --- text in its different forms ------------------------------------
    our $text                  = '';        # text of the article (work copy)
    our $text_origin           = '';        # text of the article as fetched (for save)
    our $text_without_comments = '';        # text without comments (for save)

    # --- page properties -------------------------------------------------
    our ($page_is_redirect, $page_is_disambiguation) = ('no', 'no');
    our ($page_categories, $page_interwikis)         = ('', '');

    # --- error bookkeeping -----------------------------------------------
    our $page_has_error        = 'no';      # yes/no error in this page
    our $page_error_number     = -1;
    our $page_has_geo_error    = 'no';      # yes/no geo error in this page
    our $page_geo_error_number = -1;
    our $details_for_page      = 'no';      # yes/no more details during the scan

    # --- comments: [0] pos_start, [1] pos_end, [2] comment --------------
    our @comments;
    our $comment_counter = -1;              # number of comments in this page

    # --- categories: [0] pos_start, [1] pos_end, [2] category,
    #                 [3] linkname, [4] original [[Category:Test|Linkname]]
    our @category;
    our $category_counter = -1;
    our $category_all = '';                 # all categories

    # --- interwikis: [0] pos_start, [1] pos_end, [2] interwiki,
    #                 [3] linkname, [4] original [[de:Test|Linkname]],
    #                 [5] language
    our @interwiki;
    our $interwiki_counter = -1;

    # --- structure of the text -------------------------------------------
    our (@lines, @headlines, @lines_first_blank);
    our @section;                           # text between headlines
    undef(@section);

    # --- templates: [0] number of template, [1] templatename,
    #                [2] template_row, [3] attribut, [4] value
    our (@templates_all, @templates);
    our $number_of_template_parts = -1;     # number of all template parts

    # --- other collected elements ----------------------------------------
    our (@links_all, @images_all, @isbn, @ref);
}
sub close_file {
    # Close the two input handles of a scan: DUMP first, then TEMPLATETIGER.
    # The status of the second close is what the sub returns.
    close DUMP;
    return close TEMPLATETIGER;
}
sub read_and_write_metadata_from_dump {
    # Fetch the site metadata (general info, namespaces, namespace aliases,
    # statistics, magic words) of the current project from the web API and
    # store the raw answer in
    #     <output_directory><project>/<project>_metadata.txt
    # That file is what load_metadata_from_file() reads afterwards.
    #
    # Reads : $project, $language, $output_directory
    # Writes: the metadata file; $language is set to 'nds_nl' for the
    #         project 'nds_nlwiki' (kept from the previous implementation)
    # If the file cannot be written a warning goes to STDOUT and LOGFILE
    # and the sub returns without touching the file.
    my $query = 'w/api.php?action=query&meta=siteinfo&siprop=general|namespaces|namespacealiases|statistics|magicwords&format=xml';

    # the host name of this wiki uses a hyphen where the project name
    # uses an underscore
    my $host_language = ($project eq 'nds_nlwiki') ? 'nds-nl' : $language;

    my $url = 'http://'.$host_language.'.wikipedia.org/'.$query;
    $url = 'http://commons.wikimedia.org/'.$query if ($project eq 'commonswiki');

    my $metadata = raw_text2($url);

    # callers may rely on the underscore form being set after this call
    $language = 'nds_nl' if ($project eq 'nds_nlwiki');

    my $file_metadata = $output_directory.$project.'/'.$project.'_metadata.txt';
    my $metadata_handle;
    if (not open($metadata_handle, '>', $file_metadata)) {
        my $problem = 'WARNING: cannot write metadata file '.$file_metadata.': '.$!."\n";
        print $problem;
        print LOGFILE $problem;
        return;
    }
    print {$metadata_handle} $metadata;
    if (not close($metadata_handle)) {
        # buffered write errors only show up when the handle is closed
        my $problem = 'WARNING: cannot finish metadata file '.$file_metadata.': '.$!."\n";
        print $problem;
        print LOGFILE $problem;
    }
    return;
}
sub load_metadata_from_file {
# Read <output_directory><project>/<project>_metadata.txt and fill the
# package variables that describe the wiki:
#   $base, $home                      base URL and the same URL without its last path part
#   @namespace, $namespaces_count     one row per namespace: [0] number, [1] and [2] names
#   @namespace_image / @namespace_templates / @namespace_cat
#                                     names and aliases of namespaces 6, 10 and 14
#   @namespacealiases, $namespacealiases_count
#   @magicword_*                      alias lists returned by get_magicword()
# The sitename and the base URL are printed unless $silent_modus is 'silent'.
#
# NOTE(review): several statements below look damaged - the assignment to
# @metadata has no right-hand side, and a number of index()/s/// calls have
# an empty search string or pattern. Text seems to be missing (presumably
# markup in angle brackets); confirm against the original script.
# load metadata from file for dump and live
# this file is from the last dump (if live) or current dump (if dump)
#print 'Read metadata from file'."\n";
my $file_metadata = $output_directory.$project.'/'.$project.'_metadata.txt';
# NOTE(review): 2-argument open without a check - a missing file is not reported here.
open(METADATA, "<$file_metadata");
my @metadata = ;
close(METADATA);
# join all lines into one string
my $metatext = '';
foreach (@metadata) {
$metatext = $metatext.$_;
}
#print $metatext."\n";
#Extract metadata
#sitename
# value between 'sitename="' and the next double quote
my $sitename = '';
my $pos1 = index($metatext,'sitename="') + length('sitename="');
my $pos2 = index($metatext,'"', $pos1);
$sitename = substr($metatext, $pos1, $pos2 - $pos1);
print 'Sitename: '."\t\t".$sitename."\n" if ($silent_modus ne 'silent');
#base
# value between 'base="' and the next double quote
$base = '';
$pos1 = index($metatext,'base="') + length('base="');
$pos2 = index($metatext,'"', $pos1 );
$base = substr($metatext, $pos1, $pos2 -$pos1);
print 'Base: '."\t\t".$base."\n" if ($silent_modus ne 'silent');
# $home is $base with everything after the last '/' removed
$home = $base;
$home =~ s/[^\/]+$//;
#print 'Home: '."\t\t".$home."\n";
#namespace
# NOTE(review): the two index() calls search for an empty string as shown;
# presumably they located the start and end of the namespace list.
my $namespaces = '';
$pos1 = index($metatext,'') + length('');
$pos2 = index($metatext,'', $pos1);
$namespaces = substr($metatext, $pos1, $pos2 -$pos1);
#print "x".$namespaces."x\n";
#$namespaces =~ s/^\n//g;
# turn the list into one line per namespace with tab separated fields
$namespaces =~ s/<\/ns>/\n/g;
$namespaces =~ s/" subpages="" //g;
$namespaces =~ s//\t/g;
$namespaces =~ s/" \/>/\t\n/g;
#$namespaces =~ s/ //g;
#print "x".$namespaces."x\n";
my @namespaces_split = split( /\n/, $namespaces);
$namespaces_count = @namespaces_split;
#print $namespaces_count;
for (my $i = 0; $i < $namespaces_count; $i++) {
#print $i."\t".$namespaces_split[$i]."\n";
my @splitter = split( /\t/, $namespaces_split[$i]);
# column 0: namespace number (a line starting with '0' is namespace 0)
if ( $namespaces_split[$i] =~ /^0/) {
$namespace[$i][0] = 0;
} else {
$namespace[$i][0] = int($splitter[0]);
}
# column 1 comes from the third field, column 2 from the second field;
# both are forced to '' for namespace 0
$namespace[$i][1] = $splitter[2];
$namespace[$i][1] = '' if ($namespace[$i][0] == 0);
$namespace[$i][2] = $splitter[1];
$namespace[$i][2] = '' if ($namespace[$i][0] == 0);
if ($namespace[$i][0] == 6) {
# image
$namespace_image[0] = $namespace[$i][1];
$namespace_image[1] = $namespace[$i][2];
}
if ($namespace[$i][0] == 10) {
# templates
# the second name is stored only when it differs from the first
$namespace_templates[0] = $namespace[$i][1];
$namespace_templates[1] = $namespace[$i][2] if ($namespace[$i][1] ne $namespace[$i][2]);
}
if ($namespace[$i][0] == 14) {
#category
# the second name is stored only when it differs from the first
$namespace_cat[0] = $namespace[$i][1];
$namespace_cat[1] = $namespace[$i][2] if ($namespace[$i][1] ne $namespace[$i][2]);
}
}
# namespacealiases
# NOTE(review): same damage as above - empty search strings and an empty pattern.
my $namespacealiases_text = '';
$pos1 = index($metatext,'') + length('');
$pos2 = index($metatext,'', $pos1);
$namespacealiases_text = substr($metatext, $pos1, $pos2 -$pos1);
#print $namespacealiases_text. "\n";
$namespacealiases_text =~ s/<\/ns>/\n/g;
$namespacealiases_text =~ s//\t/g;
#print $namespacealiases_text. "\n";
my @namespacealiases_split = split( /\n/, $namespacealiases_text);
$namespacealiases_count = @namespacealiases_split;
#print $namespaces_count;
for (my $i = 0; $i < $namespacealiases_count; $i++) {
# field 0: namespace number, field 1: alias name
my @splitter = split( /\t/, $namespacealiases_split[$i]);
if ($splitter[0] eq '6') {
#aliasname for image
push(@namespace_image, $splitter[1]);
}
if ($splitter[0] eq '10') {
#aliasname for templates
push(@namespace_templates, $splitter[1]);
}
if ($splitter[0] eq '14') {
#aliasname for category
push(@namespace_cat, $splitter[1]);
}
#save all aliases
$namespacealiases[$i][0] = $splitter[0];
$namespacealiases[$i][1] = $splitter[1];
#print 'Namespacealiases: '.$namespacealiases[$i][0].','.$namespacealiases[$i][1]."\n";
}
#foreach (@namespace_image) {
# print $_."\n";
#}
#print "\n";
#foreach (@namespace_cat) {
# print $_."\n";
#}
#magicwords
# one alias list per magic word, each taken from the same metadata text
@magicword_defaultsort = get_magicword($metatext, 'defaultsort');
@magicword_img_thumbnail = get_magicword($metatext, 'img_thumbnail');
@magicword_img_manualthumb = get_magicword($metatext, 'img_manualthumb');
@magicword_img_right = get_magicword($metatext, 'img_right');
@magicword_img_left = get_magicword($metatext, 'img_left');
@magicword_img_none = get_magicword($metatext, 'img_none');
@magicword_img_center = get_magicword($metatext, 'img_center');
@magicword_img_framed = get_magicword($metatext, 'img_framed');
@magicword_img_frameless = get_magicword($metatext, 'img_frameless');
@magicword_img_page = get_magicword($metatext, 'img_page');
@magicword_img_upright = get_magicword($metatext, 'img_upright');
@magicword_img_border = get_magicword($metatext, 'img_border');
@magicword_img_sub = get_magicword($metatext, 'img_sub');
@magicword_img_super = get_magicword($metatext, 'img_super');
@magicword_img_link = get_magicword($metatext, 'img_link');
@magicword_img_alt = get_magicword($metatext, 'img_alt');
@magicword_img_width = get_magicword($metatext, 'img_width');
@magicword_img_baseline = get_magicword($metatext, 'img_baseline');
@magicword_img_top = get_magicword($metatext, 'img_top');
@magicword_img_text_top = get_magicword($metatext, 'img_text_top');
@magicword_img_middle = get_magicword($metatext, 'img_middle');
@magicword_img_bottom = get_magicword($metatext, 'img_bottom');
@magicword_img_text_bottom = get_magicword($metatext, 'img_text_bottom');
#foreach (@magicword_defaultsort) {
# print $_."\n";
#}
}
sub get_magicword {
# Collect alias strings from the metadata text and return them as a list.
# Parameters: $_[0] metadata text, $_[1] key (name of the magic word)
# Returns   : @result, one entry per processed element of @part_split
#
# NOTE(review): this sub looks damaged. $part and @part_split are used but
# never declared or filled in the visible code, $key is read but not used,
# two index() calls search for an empty string, and there is one closing
# brace more than opening braces. Statements seem to be missing (presumably
# lines that contained markup in angle brackets); confirm against the
# original script before changing anything here.
my $metatext = $_[0];
my $key = $_[1];
my @result;
my $pos1 = index( $metatext, '', $part );
# the first element of the split list is dropped
shift (@part_split);
foreach (@part_split) {
#print $_."\n"
# the alias is everything in front of the position found by index()
my $pos3 = index ($_, '');
my $alias = substr ($_, 0, $pos3);
#print $alias ."\n";
push (@result, $alias );
}
return(@result);
}
}
sub get_next_page_from_dump {
    # Read the dump (filehandle DUMP) line by line until one complete
    # <page> ... </page> record has been seen or the dump ends.
    #
    # Results are left in package variables:
    #   $title          content of the <title> element
    #   $page_id        content of the first <id> element of the page
    #   $revision_id    content of the first <id> element after <revision>
    #   $revision_time  content of the <timestamp> element after <revision>
    #   $text           content of the <text> element, passed through
    #                   replace_special_letters(); '' if the page has none
    #   $end_of_dump    'yes' after </mediawiki> or at end of file
    #   $line_number    incremented for every line read
    #
    # $page_id and $revision_id are expected to be -1 on entry (see
    # set_variables_for_article); only then are they filled.
    my $line = '';                # one line of the dump
    my $article_complete = 0;     # 1 after </page>
    my $start_recording = 0;      # 1 between <page> and </page>
    my $revision_start = 0;       # 1 after <revision>

    # value of a one-line element: the part between the first '>' and the
    # following '<' (e.g. "<title>Foo</title>" gives "Foo")
    my $element_value = sub {
        my @content = split(/>/, $_[0]);
        @content = split(/</, $content[1]);
        return($content[0]);
    };

    # loop for every line
    do {
        $line = <DUMP>;
        $line = '' if (not defined $line);    # read at end of file
        $line_number = $line_number + 1;

        $start_recording = 1 if ($line =~ /<page>/);
        $text = $text.$line if ($start_recording == 1);
        if ($line =~ /<\/page>/) {
            $start_recording = 0;
            $article_complete = 1;
        }

        if ($line =~ /<title>/) {
            $title = $element_value->($line);
        }
        if ($line =~ /<id>/ and $page_id == -1) {
            # the first id of a page record is the page id
            $page_id = $element_value->($line);
        }
        if ($line =~ /<revision>/) {
            $revision_start = 1;
        }
        if ($revision_start == 1 and $revision_id == -1 and $line =~ /<id>/) {
            # the first id inside the revision is the revision id
            # (later ids, e.g. of the contributor, are ignored)
            $revision_id = $element_value->($line);
        }
        if ($revision_start == 1 and $line =~ /<timestamp>/) {
            $revision_time = $element_value->($line);
        }

        $end_of_dump = 'yes' if ($line =~ /<\/mediawiki>/);
        $end_of_dump = 'yes' if (eof(DUMP));
    }
    until ($article_complete == 1 or $end_of_dump eq 'yes');

    # Extract only the edit text: everything between the opening <text ...>
    # tag (whatever attributes it carries) and </text>.
    my $article_text = '';
    if ($text =~ /<text\b([^>]*)>/) {
        my $attributes = $1;
        my $body_start = $+[0];    # offset right behind the opening tag
        if ($attributes !~ /\/\s*$/) {
            # not a self-closing <text ... /> of an empty page
            $article_text = substr($text, $body_start);
            my $body_end = index($article_text, '</text>');
            $article_text = substr($article_text, 0, $body_end) if ($body_end > -1);
        }
    }
    $text = replace_special_letters($article_text);
}
sub get_next_page_from_live {
# Deliver the next article of a live scan.
# The articles to scan are held in @live_to_scan for one error number at a
# time ($current_live_error_scan); $current_live_article is the index of
# the article inside that list. The sub
#   1. advances the index and, if needed, switches to the next error number,
#   2. sets $end_of_live to 'yes' when error number 0 has no articles left,
#   3. loads the texts of the next articles through raw_text_more_articles()
#      when the buffer $xml_text_from_api is empty,
#   4. cuts the next page out of that buffer and fills $title, $page_id and
#      $text; the title is also pushed onto @article_was_scanned.
#
# NOTE(review): several statements below search for an empty string or use
# an empty pattern, and the line that starts the "get text" step refers to
# $pos without declaring it. Text seems to be missing there (presumably
# markup in angle brackets); confirm against the original script.
$current_live_article ++; #next article
if ( $current_live_error_scan != 0 ) {
# Error not 0 (new aricles, and last changes...)
if ($current_live_error_scan != 0 and $current_live_article == $maximum_current_error_scan) {
# set number higher if not all 50 errors found
#print 'Nr.'.$current_live_error_scan."\n";
#print 'Found at moment :'.$error_description[$current_live_error_scan][3]."\n";
#print 'Max allowed:'.$max_error_count."\n";
#print 'Max possible:'.$number_article_live_to_scan."\n";
if ( $error_description[$current_live_error_scan][3] < $max_error_count ) {
# set higer maximum
# raise the limit by the number of errors still missing up to $max_error_count
$maximum_current_error_scan = $maximum_current_error_scan + ($max_error_count - $error_description[$current_live_error_scan][3]);
#print 'Set higher maximum: '.$maximum_current_error_scan."\n";
} else {
# stop scan
# remember the articles not scanned yet; index -1 triggers the switch below
save_errors_for_next_scan($current_live_article);
#$rest_of_errors_not_scan_yet
$current_live_article = -1;
}
}
# find next error with articles
if (($current_live_error_scan > 0 and $current_live_article == -1)
or $current_live_article == $number_article_live_to_scan
or $current_live_error_scan == -1) {
#print 'switch from error to error'."\n";
$current_live_error_scan = 0 if ($current_live_error_scan == -1); #start with error 1
# step through the error numbers until one has articles to scan
# or the highest error number is reached
do {
$current_live_error_scan ++;
#print $current_live_error_scan."\n";
@live_to_scan = ();
if ($error_description[$current_live_error_scan][3] < $max_error_count) {
# only if not all found with new/change/last
get_all_error_with_number($current_live_error_scan);
} else {
# if with new /change etc. we found for this error much
# all articles of this error are only saved for the next scan
get_all_error_with_number($current_live_error_scan);
save_errors_for_next_scan(0);
@live_to_scan = ();
}
$number_article_live_to_scan = @live_to_scan;
} until ($current_live_error_scan >= $number_of_max_errors
or $number_article_live_to_scan > 0);
$maximum_current_error_scan = $max_error_count;
if ($error_description[$current_live_error_scan][3] > 0) {
#print 'More errors for error'.$current_live_error_scan."\n";
#print 'At moment only :'.$error_description[$current_live_error_scan][3]."\n";
$maximum_current_error_scan = $max_error_count - $error_description[$current_live_error_scan][3];
#print 'Search now for more :'.$maximum_current_error_scan."\n";
}
# start at the first article of the new list with an empty text buffer
$current_live_article = 0;
$xml_text_from_api = '';
#print '#############################################################'."\n";
#print 'Error '.$current_live_error_scan.' :'."\t".$number_article_live_to_scan."\n" if ($number_article_live_to_scan > 0);
#print 'Max='.$maximum_current_error_scan."\n";
#print 'Available = '.$number_article_live_to_scan."\n";
}
}
if ( $current_live_error_scan == 0
and $current_live_article >= $number_article_live_to_scan ) {
# end of live, no more article to scan
$end_of_live = 'yes';
}
if ($current_live_error_scan >= $number_of_max_errors) {
# after check live all errors, then start with check of error 0 (new articles, last changes, ...)
$current_live_article = 0;
$xml_text_from_api = '';
$current_live_error_scan = 0;
get_all_error_with_number($current_live_error_scan);
$number_article_live_to_scan = @live_to_scan;
#print 'Error 0 :'."\t".$number_article_live_to_scan."\n";
$maximum_current_error_scan = $max_error_count;
}
#$number_article_live_to_scan = @live_to_scan;
if ( $current_live_article < $number_article_live_to_scan
and $number_article_live_to_scan > 0
and $end_of_live ne 'yes' ) {
# there is an error with articles
# now we get the next article
if ($xml_text_from_api eq '') {
# if list of xml_text_from_api is empty, then load next ariticles
#print 'Load next texts from API'."\n";
# build one request for several titles, joined with '|'
my $many_titles = '';
my $i = $current_live_article;
my $end_many_title = 'false';
do {
my $line = $live_to_scan[$i];
my @line_split = split( /\t/, $line);
my $next_title = $line_split[0];
print LOGFILE $next_title."\n";
$next_title = replace_special_letters($next_title);
$many_titles = $many_titles.'|'.uri_escape($next_title);
$many_titles =~ s/^\|//;
$i++;
$end_many_title = 'true' if ($i == $number_article_live_to_scan);
$end_many_title = 'true' if ($i == $current_live_article + 25); # not more then 25 articles
$end_many_title = 'true' if ( length($many_titles) > 2000); # url length not too long (Problem ruwiki and other no latin letters )
}
until ($end_many_title eq 'true');
#print 'Many titles ='.$many_titles."\n";
$xml_text_from_api = raw_text_more_articles( $many_titles );
# strip the envelope of the answer so that only the pages remain
# NOTE(review): three of these patterns are empty as shown - confirm.
$xml_text_from_api =~ s/^<\?xml version="1\.0"\?>//;
$xml_text_from_api =~ s/^//;
$xml_text_from_api =~ s/^//;
$xml_text_from_api =~ s/^//;
$xml_text_from_api =~ s/<\/api>$//;
$xml_text_from_api =~ s/<\/query>$//;
$xml_text_from_api =~ s/<\/pages>$//;
#print $xml_text_from_api."\n";
}
# get next title and text from xml_text_from_ap
if ($xml_text_from_api ne '') {
# cut the next page off the front of the buffer
# NOTE(review): the end marker searched for here is empty as shown - confirm.
my $pos_end = index ($xml_text_from_api, '' );
if ($pos_end > -1 ) {
# normal page
$text = substr ( $xml_text_from_api, 0, $pos_end + length('') );
$xml_text_from_api = substr ( $xml_text_from_api, $pos_end + length('') );
} else {
# missing page
#
#print 'Missing Page'."\n";
$pos_end = index ($xml_text_from_api, 'missing="" />' );
$text = substr ( $xml_text_from_api, 0, $pos_end + length('missing="" />') );;
$xml_text_from_api = substr ( $xml_text_from_api, $pos_end + length('missing="" />') );
if ($pos_end == -1){
#BIG PROBLEM
# neither marker was found: drop the page and the whole buffer
print 'WARNING: Big problem with API'."\n";
print LOGFILE 'WARNING: Big problem with API'."\n";
$text = '';
$xml_text_from_api = '';
}
}
# expected title: first tab separated field of the current list entry
my $line = $live_to_scan[$current_live_article];
my @line_split = split( /\t/, $line);
$title = $line_split[0];
#print $title ."\n";
#print substr ( $text, 0, 150)."\n";
if (index ( $text, 'title='.'"'.$title.'"') == -1 ) {
# the result from the api is in a other sort
# know get the current title
# for example
#print "Old title:".$title ."\n";
# take the title from the first title="..." attribute of the page instead
my $pos_title = index ($text, 'title="');
my $title_text = $text;
$title_text = substr ( $title_text, $pos_title + length ('title="') );
$pos_title = index ($title_text, '"');
$title = substr ( $title_text, 0, $pos_title );
#print "New title:".$title;
#print "\n\n";
#print substr ( $text, 0, 150)."\n";
#print "\n\n";
}
#print $title."\n";
push(@article_was_scanned, $title);
# get id
# value of the first pageid="..." attribute, if there is one
my $test_id_pos = index ($text, 'pageid="');
if ($test_id_pos > -1) {
$page_id = substr($text, $test_id_pos + length( 'pageid="') );
$test_id_pos = index ($page_id , '"');
$page_id = substr($page_id, 0, $test_id_pos);
#print $page_id.' - '.$title."\n";
}
# get text
# NOTE(review): the next statement looks like the remains of several
# statements ($pos is not declared, a block seems to open here) - confirm.
my $test = index ($text, '', $test );
$text = substr($text, $pos + 2);
#$text =~ s///g;
$test = index($text,'');
$text = substr($text,0,$test);
}
#revision_id
#revision_time
#print $text."\n";
#print substr($text, 0, 60)."\n";
$text = replace_special_letters($text);
}
}
}
sub save_errors_for_next_scan {
    # Remember the articles of @live_to_scan that were not scanned in this
    # run: for every entry from index $_[0] to the end of the list a line
    #     "\n" . title . "\t" . current error number
    # is appended to $rest_of_errors_not_scan_yet. The title is the first
    # tab separated field of the entry.
    # Also refreshes $number_article_live_to_scan.
    my ($first_index) = @_;
    $number_article_live_to_scan = @live_to_scan;
    my $last_index = $number_article_live_to_scan - 1;
    foreach my $index ($first_index .. $last_index) {
        my @fields = split( /\t/, $live_to_scan[$index] );
        my $pending_title = $fields[0];
        $rest_of_errors_not_scan_yet = $rest_of_errors_not_scan_yet."\n".$pending_title."\t".$current_live_error_scan;
    }
}
sub get_all_error_with_number {
    # Append to @live_to_scan every line of @live_article that
    #   - lists the error number $_[0] in its second tab separated field
    #     (a list of numbers separated by ', '), and
    #   - belongs to an article that was not scanned in this run yet
    #     (its title, the first field, is not in @article_was_scanned).
    # The lines are appended unchanged and in their original order.
    #
    # All working variables are lexical, so the package variables $i and
    # @line_split of the caller are left alone.
    my $error_live = $_[0];

    # one hash lookup per line instead of a walk through the whole list
    my %was_scanned = map { $_ => 1 } @article_was_scanned;

    foreach my $current_live_line (@live_article) {
        my @line_split = split( /\t/, $current_live_line );
        next if (not defined $line_split[1]);    # line without error list

        # has this article the error X?
        my $has_error = 'no';
        foreach my $error_number (split( ', ', $line_split[1] )) {
            if ($error_live eq $error_number) {
                $has_error = 'yes';
                last;
            }
        }
        next if ($has_error eq 'no');

        # was this article scanned in this run?
        next if ($was_scanned{ $line_split[0] });

        push(@live_to_scan, $current_live_line);
    }
}
sub get_all_error_with_type {
    # at the moment not in use
    # Append to @live_to_scan every line of @live_article whose second tab
    # separated field is equal to $_[0]. The lines are appended unchanged
    # and in their original order.
    #
    # All working variables are lexical, so the package variables $i and
    # @line_split of the caller are left alone.
    my $error_type = $_[0];
    foreach my $current_live_line (@live_article) {
        my @line_split = split( /\t/, $current_live_line );
        # a line without second field counts as empty type, as before
        my $line_type = defined $line_split[1] ? $line_split[1] : '';
        if ($line_type eq $error_type) {
            push(@live_to_scan, $current_live_line);
        }
    }
}
sub replace_special_letters {
    # Turn the HTML/XML entities used for markup characters back into the
    # characters themselves and return the result:
    #     &lt;   -> <
    #     &gt;   -> >
    #     &quot; -> "
    #     &#039; -> '
    #     &amp;  -> &
    # Parameter: $_[0] text
    # Returns  : the text with the five entities replaced
    #
    # http://de.wikipedia.org/w/index.php?title=Benutzer_Diskussion:Stefan_K%C3%BChn&oldid=48573921#Dump
    my $content = $_[0];
    $content =~ s/&lt;/</g;
    $content =~ s/&gt;/>/g;
    $content =~ s/&quot;/"/g;
    $content =~ s/&#039;/'/g;
    # &amp; has to be last, otherwise "&amp;lt;" would be decoded twice
    $content =~ s/&amp;/&/g;
    return ($content);
}
sub raw_text {
    # Fetch one page through the web API of the current wiki and return the
    # raw answer (XML with timestamp and content of the page).
    # Parameter: $_[0] page title; HTML entities in the title are decoded
    # Returns  : content of the answer, or '' if the answer has no content
    # Reads    : $home (its "/wiki/" part is replaced by "/w/")
    #
    # NOTE(review): the answer is returned without looking at the HTTP
    # status, so the body of an error page is returned like a normal answer.
    my $title = $_[0];

    # decode entities first ...
    $title =~ s/&#039;/'/g;      # Problem with apostroph in title
    $title =~ s/&lt;/</g;
    $title =~ s/&gt;/>/g;
    $title =~ s/&quot;/"/g;
    $title =~ s/&amp;/&/g;
    # ... then protect every & of the title, it would end the parameter
    $title =~ s/&/%26/g;         # Problem with & in title

    my $url2 = $home;
    $url2 =~ s/\/wiki\//\/w\//;
    $url2 = $url2.'api.php?action=query&prop=revisions&titles='.$title.'&rvprop=timestamp|content&format=xml';
    #print $url2."\n";

    my $ua2 = LWP::UserAgent->new;
    my $response2 = $ua2->get( $url2 );
    my $content2 = $response2->content;
    my $result2 = '';
    $result2 = $content2 if ($content2);
    return($result2);
}
sub raw_text2 {
    # Fetch a complete URL and return the content of the answer.
    # Parameter: $_[0] URL, including its query string
    # Returns  : content of the answer, or '' if the answer has no content
    #
    # Only the entity forms "&amp;" and "&#039;" are rewritten. A plain "&"
    # is left alone because it separates the parameters of the query string.
    #
    # NOTE(review): the answer is returned without looking at the HTTP
    # status, so the body of an error page is returned like a normal answer.
    my $url = $_[0];
    $url =~ s/&amp;/%26/g;      # Problem with & in title
    $url =~ s/&#039;/'/g;       # Problem with apostroph in title

    my $ua2 = LWP::UserAgent->new;
    my $response2 = $ua2->get( $url );
    my $content2 = $response2->content;
    my $result2 = '';
    $result2 = $content2 if ($content2);
    return($result2);
}
sub raw_text_more_articles {
    # Fetch several pages with one request to the web API of the current
    # wiki and return the raw answer.
    # Parameter: $_[0] the titles, already escaped and joined with '|'
    # Returns  : content of the answer, or '' if the answer has no content
    # The request URL is written to LOGFILE.
    my ($title) = @_;

    # API entry point: $home with "/wiki/" replaced by "/w/"
    (my $api_base = $home) =~ s/\/wiki\//\/w\//;
    my $url2 = $api_base.'api.php?action=query&prop=revisions&titles='.$title.'&rvprop=timestamp|content&format=xml';
    print LOGFILE $url2."\n";

    my $response2 = LWP::UserAgent->new->get( $url2 );
    my $content2 = $response2->content;
    return($content2) if ($content2);
    return('');
}
####################################
sub text_translation_input {
    # Load the translation page of the current project from the wiki and
    # take over the texts found there:
    #   - start text, description text, category text and the three
    #     priority headlines (only if the page has a non-empty value)
    #   - for every error number below $number_of_max_errors the priority
    #     ([4], -1 if not given), headline ([5]) and description ([6]) in
    #     @error_description
    print 'Load tanslation of:'."\t".$project."\n" if ($silent_modus ne 'silent');

    # translation page of every known project
    my %translation_page_of = (
        'afwiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'arwiki'      => 'ويكيبيديا:فحص_ويكيبيديا/ترجمة',
        'cawiki'      => 'Viquipèdia:WikiProject Check Wikipedia/Translation',
        'cswiki'      => 'Wikipedie:WikiProjekt Check Wikipedia/Translation',
        'commonswiki' => 'Commons:WikiProject Check Wikipedia/Translation',
        'cywiki'      => 'Wicipedia:WikiProject Check Wikipedia/Translation',
        'dawiki'      => 'Wikipedia:WikiProjekt Check Wikipedia/Oversættelse',
        'dewiki'      => 'Wikipedia:WikiProjekt Syntaxkorrektur/Übersetzung',
        'enwiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'eowiki'      => 'Projekto:Check Wikipedia/Tradukado',
        'eswiki'      => 'Wikiproyecto:Check Wikipedia/Translation',
        'fiwiki'      => 'Wikipedia:Wikiprojekti Check Wikipedia/Translation',
        'frwiki'      => 'Projet:Correction syntaxique/Traduction',
        'fywiki'      => 'Meidogger:Stefan Kühn/WikiProject Check Wikipedia/Translation',
        'hewiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'huwiki'      => 'Wikipédia:Ellenőrzőműhely/Fordítás',
        'idwiki'      => 'Wikipedia:ProyekWiki Cek Wikipedia/Terjemahan',
        'iswiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'itwiki'      => 'Wikipedia:WikiProjekt Check Wikipedia/Translation',
        'jawiki'      => 'Wikipedia:ウィキプロジェクト ウィキ文法のチェック/Translation',
        'lawiki'      => 'Vicipaedia:WikiProject Check Wikipedia/Translation',
        'ndswiki'     => 'Wikipedia:Wikiproject Check Wikipedia/Translation',
        'nds_nlwiki'  => 'Wikipedie:WikiProject Check Wikipedia/Translation',
        'nlwiki'      => 'Wikipedia:Wikiproject/Check Wikipedia/Vertaling',
        'nowiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'pdcwiki'     => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'plwiki'      => 'Wikiprojekt:Check Wikipedia/Tłumaczenie',
        'ptwiki'      => 'Wikipedia:Projetos/Check Wikipedia/Tradução',
        'ruwiki'      => 'Википедия:Страницы с ошибками в викитексте/Перевод',
        'rowiki'      => 'Wikipedia:WikiProject Check Wikipedia/Translation',
        'skwiki'      => 'Wikipédia:WikiProjekt Check Wikipedia/Translation',
        'svwiki'      => 'Wikipedia:Projekt wikifiering/Syntaxfel/Translation',
        'trwiki'      => 'Vikipedi:Vikipedi proje kontrolü/Çeviri',
        'ukwiki'      => 'Вікіпедія:Проект:Check Wikipedia/Translation',
        'yiwiki'      => 'װיקיפּעדיע:קאנטראלירן_בלעטער/Translation',
        'zhwiki'      => '维基百科:专题/错误检查/翻译',
    );
    # for an unknown project $translation_page keeps its current value
    $translation_page = $translation_page_of{$project} if (exists $translation_page_of{$project});

    my $translation_input = replace_special_letters( raw_text($translation_page) );
    #print $translation_input."\n";

    # general texts: key on the translation page => variable to fill;
    # a variable is only overwritten by a non-empty value
    my @general_texts = (
        [ 'start_text_'.$project.'=',       \$start_text ],
        [ 'description_text_'.$project.'=', \$description_text ],
        [ 'category_001=',                  \$category_text ],
        [ 'top_priority_'.$project.'=',     \$top_priority_project ],
        [ 'middle_priority_'.$project.'=',  \$middle_priority_project ],
        [ 'lowest_priority_'.$project.'=',  \$lowest_priority_project ],
    );
    foreach my $general_text (@general_texts) {
        my ($key, $target) = @{$general_text};
        my $input_text = get_translation_text($translation_input, $key, 'END');
        ${$target} = $input_text if ($input_text ne '');
    }

    # find error description
    foreach my $error_number (0 .. $number_of_max_errors - 1) {
        # keys look like error_007_..., error_042_..., error_100_...
        my $error_key = sprintf('error_%03d', $error_number);

        # Priority: number from the page, -1 if the page gives none
        my $priority = get_translation_text($translation_input, $error_key.'_prio_'.$project.'=', 'END');
        $error_description[$error_number][4] = ($priority ne '') ? int($priority) : -1;

        $error_description[$error_number][5] = get_translation_text($translation_input, $error_key.'_head_'.$project.'=', 'END');
        $error_description[$error_number][6] = get_translation_text($translation_input, $error_key.'_desc_'.$project.'=', 'END');
    }
}
sub get_translation_text {
    # Cut one value out of the translation page.
    # Parameters: $_[0] text of the page
    #             $_[1] start tag, e.g. 'start_text_enwiki='
    #             $_[2] end tag, e.g. 'END'
    # Returns   : the text behind the first '=' of the start tag up to the
    #             end tag, without one leading and one trailing blank;
    #             '' if the start tag is missing or the end tag is not
    #             found at a position greater than 0
    my ($page_text, $start_tag, $end_tag) = @_;

    my $start_pos = index($page_text, $start_tag);
    my $end_pos   = index($page_text, $end_tag, $start_pos);   # searched from the start tag on
    return('') unless ($start_pos > -1 and $end_pos > 0);

    # "key=value" part, then everything behind the first '='
    my $value = substr($page_text, $start_pos, $end_pos - $start_pos);
    $value = substr($value, index($value, '=') + 1);
    $value =~ s/^ //g;
    $value =~ s/ $//g;
    return($value);
}
sub text_translation_output{
# Output of translation-file
my $filename = $output_directory.$project.'/'.$project.'_'.$translation_file;
print 'Output translation:'."\t".$project.'_'.$translation_file."\n" if ($silent_modus ne 'silent');
open(TRANSLATION, ">$filename");
#######################################
print TRANSLATION '