#!/usr/bin/perl -w
##########################
# Transform Personendaten
##########################
# Reads tab-separated person records (from STDIN or the files named on the
# command line) and prints each record with 22 derived columns appended:
# the checked PND number and PND date, given name / surname, the linked
# birth and death places, and 8 parsed fields for each of the two dates
# (day, month, year, decade, century, year1, year2, note).
##########################

use strict;
use warnings;

# German month names => month number.
# NOTE: the source is deliberately NOT under "use utf8"; month names are
# compared byte-wise against the (undecoded) input.
my %monate = (
    "Januar"    => 1,
    "Februar"   => 2,
    "März"      => 3,
    "April"     => 4,
    "Mai"       => 5,
    "Juni"      => 6,
    "Juli"      => 7,
    "August"    => 8,
    "September" => 9,
    "Oktober"   => 10,
    "November"  => 11,
    "Dezember"  => 12,
);

# Inverse mapping: month number => month name.
my %monatsnamen = reverse %monate;

# Regex fragments (each contains exactly one capture group).
my $MONAT  = "(" . join("|", sort keys %monate) . ")";    # a month name
my $JAHR   = "(\\d{1,4})";                                # a year, 1-4 digits
my $ZUSATZ = "([^0-9]+)";                                 # leading free-text qualifier

# Names of all fields returned by parse_date().
my @fields = ("day", "month", "year", "century", "decade", "year1", "year2", "note");

# Normalises a free-form date string to the notation that parse_date()
# understands (e.g. "12.03.1900" => "12. März 1900", "ca. 1900?" =>
# "vermutlich um 1900").
#   param:   the raw date string (may be undef)
#   returns: the cleaned string ('' for undef input)
sub clean_date {
    my $d = shift;
    return '' if not defined $d;

    # Remove square brackets.
    $d =~ s/[\[\]]//g;

    # Trim first so that the anchored DD.MM.YYYY check below also works on
    # padded input.
    $d =~ s/^\s+//;
    $d =~ s/\s+$//;

    # DD.MM.YYYY => D. Monthname YYYY (only for a valid month number).
    if ($d =~ /^0?(\d{1,2})\.(\d{1,2})\.(\d{4})$/) {
        my ($day, $month, $year) = ($1, int $2, $3);
        $d = "$day. $monatsnamen{$month} $year" if exists $monatsnamen{$month};
    }

    # Insert forgotten spaces.
    $d =~ s/([a-z])(\d)/$1 $2/g;
    $d =~ s/(\d)([a-z])/$1 $2/g;
    $d =~ s/\.([^ ])/. $1/g;

    # Collapse runs of whitespace, then trim both ends.
    $d =~ s/\s+/ /g;
    $d =~ s/^\s+//;
    $d =~ s/\s+$//;

    # A lone question mark => "unbekannt".
    $d =~ s/^\?$/unbekannt/;

    # "n. Chr." is implicit and therefore dropped.
    $d =~ s/n\. ?Chr\.//;

    # Question marks are always parenthesised and preceded by a space.
    $d =~ s/\(\?\)/?/g;
    $d =~ s/([^ ])\?+/$1 ?/g;
    $d =~ s/\?/(?)/g;

    # A trailing question mark becomes a leading "vermutlich".
    $d =~ s/^(.+) \(\?\)$/vermutlich $1/;
    $d =~ s/^\(\?\)$/vermutlich/;

    # No spaces around a slash.
    $d =~ s{ ?/ ?}{/}g;

    # Jänner => Januar
    $d =~ s/Jänner/Januar/;

    # Abbreviated month names.
    $d =~ s/Jan\./Januar/;
    $d =~ s/Feb\./Februar/;
    $d =~ s/Aug\./August/;
    $d =~ s/Okt\./Oktober/;
    $d =~ s/Nov\./November/;
    $d =~ s/Dez\./Dezember/;

    # Spell out "Jahrhundert".
    $d =~ s/Jh\./Jahrhundert/g;

    # "Ende des 5. Jahrhunderts" => "Ende 5. Jahrhundert" (simpler).
    $d =~ s/Jahrhunderts/Jahrhundert/;
    $d =~ s/ des//;

    $d =~ s/(ungefähr|circa|ca\.?|etwa|gegen|~)/um/;
    $d =~ s/(wohl|wahrscheinlich)/vermutlich/g;

    # Collapse runs of whitespace again.
    $d =~ s/\s+/ /g;

    #---------- error clean-up ---------
    # Miscellaneous typos.
    $d =~ s/^Um/um/;
    $d =~ s/chr/Chr/;

    # Final trim: removals above (e.g. of "n. Chr.") can leave a trailing
    # blank that would stop parse_date()'s anchored patterns from matching.
    $d =~ s/^\s+//;
    $d =~ s/\s+$//;

    return $d;
}

# Integer base-10 logarithm, i.e. (number of digits of the integer part) - 1.
# Computed from the decimal representation rather than via log(), so it is
# exact at powers of ten and does not die for 0.
#   param:   a number
#   returns: floor(log10(|x|)) for |x| >= 1, otherwise 0
sub ilog10 {
    my $x = shift;
    return length(int(abs($x))) - 1;
}

# Century number for a year or decade: int(n/100)+1 for n >= 0; BC values
# (negative) yield negative centuries, mirroring the sign convention used
# for explicitly given centuries.
sub _century_of {
    my $n = shift;
    return $n < 0 ? int($n / 100) - 1 : int($n / 100) + 1;
}

# First year of the decade containing $year ("+ 0" normalises a negative
# zero that int() yields for years -9..-1).
sub _decade_of {
    my $year = shift;
    return int($year / 10) * 10 + 0;
}

# Parses a date string as produced by clean_date().
#   param:   the cleaned date string
#   returns: a hash (as list) with the keys listed in @fields; fields that
#            do not apply are ''. BC values are negative. Anything that is
#            not recognised is returned unchanged in "note".
sub parse_date {
    my $date = shift;
    my %d    = ();

    if (!$date) {
        $d{"note"} = "";

    # Ordinary date: [qualifier] [day.] month year [v. Chr.]
    } elsif ($date =~ /^$ZUSATZ?(\d+\. )?$MONAT $JAHR( v\. Chr\.)?$/) {
        my ($note, $day, $month, $year, $bc) = ($1, $2, $3, $4, $5);
        $d{"note"} = trim($note) if defined $note;
        $d{"day"} = substr($day, 0, -2) if defined $day;    # strip ". "
        if (defined $month) {
            $d{"month"} = $monate{$month};
            $d{"month"} = 3 if $month =~ /M.rz/;            # special character
        }
        $d{"year"} = int $year;
        $d{"year"} = -$d{"year"} if defined $bc;

    # Year only.
    } elsif ($date =~ /^$ZUSATZ?$JAHR( v\. Chr\.)?$/) {
        my ($note, $year, $bc) = ($1, $2, $3);
        $d{"note"} = trim($note) if defined $note;
        $d{"year"} = int $year;
        $d{"year"} = -$d{"year"} if defined $bc;

    # Century.
    } elsif ($date =~ /^$ZUSATZ?(\d{1,2})\. Jahrhundert( v\. Chr\.)?$/) {
        my ($note, $century, $bc) = ($1, $2, $3);
        $d{"note"} = trim($note) if defined $note;
        $d{"century"} = $century;
        $d{"century"} = -$d{"century"} if defined $bc;

    # Decade ("1920er"); the century is derived further below.
    } elsif ($date =~ /^(\d{1,4}) ?er( v\. Chr\.)?$/) {
        my ($decade, $bc) = ($1, $2);
        $d{"decade"} = _decade_of($decade);
        $d{"decade"} = -$d{"decade"} if defined $bc;

    # Two alternative years. TODO: "oder" != "bis"; TODO: v. Chr.
    } elsif ($date =~ /^$ZUSATZ?$JAHR oder $JAHR$/) {
        my ($note, $first, $second) = ($1, $2, $3);
        $d{"note"} = trim($note) if defined $note;
        $d{"year1"} = int $first;
        $d{"year2"} = int $second;

    # Abbreviated second year, e.g. 1632/33 => 1632/1633. TODO: v. Chr.
    } elsif ($date =~ /^$ZUSATZ?$JAHR\/(\d{1,4})$/) {
        my ($note, $first, $second) = ($1, int $2, $3);
        # Number of leading digits of the first year that the second lacks.
        my $missing = length($first) - length($second);
        if ($missing < 0) {
            # Error: second year is longer than the first; keep the raw
            # text as note and leave both years unset.
            $d{"note"} = $date;
        } else {
            $d{"note"} = trim($note) if defined $note;
            $d{"year1"} = $first;
            $d{"year2"} = int(substr($first, 0, $missing) . $second);
        }

    # Period of several years. TODO: not tested!
    } elsif ($date =~ /^$ZUSATZ?zwischen $JAHR( und )$JAHR( v\. Chr\.)?$/
          or $date =~ /^$ZUSATZ?$JAHR( bis |-)$JAHR( v\. Chr\.)?$/) {
        my ($note, $first, $second, $bc) = ($1, $2, $4, $5);
        $d{"note"} = trim($note) if defined $note;
        $d{"year1"} = int $first;
        $d{"year2"} = int $second;
        if (defined $bc) {
            # BC: negate both years and swap them.
            # NOTE(review): for input in chronological order ("100 bis 50
            # v. Chr.") this yields year1 > year2 - confirm intended order.
            @d{"year1", "year2"} = (-$d{"year2"}, -$d{"year1"});
        }

    } else {
        $d{"note"} = $date;
    }

    if (defined $d{"year1"} and defined $d{"year2"}) {
        # Determine the century if both years share it.
        if (_century_of($d{"year1"}) == _century_of($d{"year2"})) {
            $d{"century"} = _century_of($d{"year1"});
        }
        # Determine the decade if both years share it.
        if ($d{"century"} and int($d{"year1"} / 10) == int($d{"year2"} / 10)) {
            $d{"decade"} = _decade_of($d{"year1"});
        }
    }

    # Derive the decade from the year ...
    if (defined $d{"year"}) {
        $d{"decade"} = _decade_of($d{"year"});
    }

    # ... and the century from the decade.
    if (defined $d{"decade"}) {
        $d{"century"} = _century_of($d{"decade"});
    }

    # Set all fields that are still missing.
    foreach my $f (@fields) {
        $d{$f} = '' if not defined $d{$f};
    }

    return %d;
}

# Extracts the link target from a place given as a single wiki link.
# Accepted forms: "[[...]]" and "[[...]], ..."; for piped links
# ("[[target|label]]") the target is returned.
#   returns: the link target, or "" for anything else
#            (e.g. "[[a]] oder [[b]]" or "[[...]]/ ...")
sub parse_location {
    my $p = trim(shift);
    return "" if not defined $p;

    if ($p =~ /^\[\[([^\]]+)\]\](,.+)?$/) {
        my $target = $1;
        $target =~ s/\|.*$//;
        return $target;
    }
    return "";
}

# Extracts a checked PND number and an optional date from the PND field.
#   returns: (pnd_nr, pnd_date)
#     pnd_nr   - the leading 9 characters if they are 8 digits plus a check
#                character (0-9 or X) with a valid mod-11 checksum and the
#                number starts with 10..14; otherwise ""
#     pnd_date - "YYYY-MM-DD"-style string built from the first D.M.YYYY
#                date found in the field (digits copied as written, no
#                zero padding); otherwise ""
sub parse_pnd {
    my $p        = trim(shift);
    my $pnd_nr   = "";
    my $pnd_date = "";
    return ($pnd_nr, $pnd_date) if not defined $p;

    if ($p =~ /^([0-9]{8})([0-9X])/) {
        my ($digits, $check_char) = ($1, $2);
        my @z     = split //, $digits;
        my $check = $check_char eq "X" ? 10 : $check_char;

        # Check digit: digits weighted 2..9, sum modulo 11.
        my $sum = 0;
        $sum += ($_ + 2) * $z[$_] for 0 .. 7;

        if ($sum % 11 == $check) {
            # TODO: number range 10000000 to 14999999
            if ($z[0] == 1 && $z[1] <= 4) {
                $pnd_nr = $digits . $check_char;
            }
        }
    }

    if ($p =~ /(\d{1,2})\.(\d{1,2})\.(\d\d\d\d)/) {
        $pnd_date = "$3-$2-$1";
    }

    return ($pnd_nr, $pnd_date);
}

# Removes all square brackets from a string (undef is passed through).
sub unbracket {
    my $p = shift;
    return $p if not defined $p;
    $p =~ s/[\[\]]//g;
    return $p;
}

# Removes leading and trailing whitespace (undef is passed through).
sub trim {
    my $p = shift;
    return $p if not defined $p;
    $p =~ s/^\s+//;
    $p =~ s/\s+$//;
    return $p;
}

##########################

# Transforms every input line and prints the extended record to STDOUT.
sub main {
    while (my $line = <>) {
        # Remove the end-of-line before splitting; limit -1 keeps trailing
        # empty columns so that columns are never shifted or truncated.
        $line =~ s/\r?\n\z//;
        my @pd = split /\t/, $line, -1;
        push @pd, '' while @pd < 10;    # tolerate short records

        # 0 pd_id, 1 pd_article, 2 pd_name, 3 pd_alternative,
        # 4 pd_description, 5 pd_born, 6 pd_born_in, 7 pd_died,
        # 8 pd_died_in, 9 pd_pnd
        my @out = map { trim($_) } @pd[0 .. 9];
        $out[5] = unbracket($out[5]);
        $out[7] = unbracket($out[7]);

        # Extract checked pnd-nr and additional date: pnd_nr, pnd_date.
        @out[10, 11] = parse_pnd($pd[9]);

        # "Surname, Given name" => n_given, n_surname.
        if ($out[2] =~ /^([^,]+),([^,]+)$/) {
            my ($surname, $given) = ($1, $2);
            @out[12, 13] = (trim($given), trim($surname));
        } else {
            @out[12, 13] = ('', '');
        }

        $out[14] = parse_location($pd[6]);    # b_place
        $out[15] = parse_location($pd[8]);    # d_place

        my @date_columns = qw(day month year decade century year1 year2 note);

        my %born = parse_date(clean_date($pd[5]));
        @out[16 .. 23] = @born{@date_columns};    # b_day .. b_note

        my %died = parse_date(clean_date($pd[7]));
        @out[24 .. 31] = @died{@date_columns};    # d_day .. d_note

        print join("\t", @out) . "\n";
    }
    return;
}

# Run only when executed as a script, so the subs can be loaded and tested.
main() unless caller;