<?php
require_once("WikiSense.php");
require_once("WikiAnalyzer.php");
# Text-part type passed to $dict->putText() when analyze() stores a concept's definition sentence.
define('PART_DEFINITION','definition');
class WikiAnalyzer_wikipedia extends WikiAnalyzer {
# Corpus handle returned by $dict->putCorpus(); stays NULL until analyze() first needs it.
var $corpus;
# Fallback used by analyzeDisambiguation() when no link matches the term:
# true picks the first link of the line, false picks the last one.
var $disambigFirstLink;
/**
 * PHP4-style constructor: delegates to the WikiAnalyzer constructor and
 * resets the state owned by this subclass.
 *
 * NOTE(review): methods named after their class are no longer treated as
 * constructors on PHP 8+; confirm the targeted PHP version before relying
 * on `new WikiAnalyzer_wikipedia($wiki)` running this code.
 *
 * @param mixed $wiki wiki access object, handed to the parent constructor unchanged
 */
function WikiAnalyzer_wikipedia(&$wiki) {
    parent::WikiAnalyzer($wiki);

    # by default, prefer the first link of a disambiguation line
    $this->disambigFirstLink = true;
    # the corpus handle is created lazily by analyze()
    $this->corpus = null;
}
/**
 * Records the relations implied by every link found in $text.
 *
 * Links are obtained from $this->wiki->extractLinks($text) and handled by kind:
 *  - empty links and links back to $concept['name'] are skipped;
 *  - interlanguage links (entry has 'lang') become a 'similar_trans'
 *    relation to the external concept at $conf * 0.6, unless escaped;
 *  - category links become a 'cat' relation at $conf * 0.9; the link label
 *    (presumably the category sort key - TODO confirm) is additionally
 *    recorded as a 'meaning' term at $conf * 0.6 when it is longer than two
 *    characters, or as a 'similar' concept relation at $conf * 0.4 when it is
 *    exactly one of the marker characters ' ', '!', '*', '$', '%', '_', '-';
 *  - links into any other namespace are skipped;
 *  - plain links are delegated to putLink().
 *
 * @param string $text    wiki text to scan
 * @param mixed  $rc      resource handle the relations are attributed to
 * @param array  $concept concept record; $concept['name'] is read here
 * @param mixed  $dict    dictionary/store receiving the relations
 * @param float  $conf    base confidence, scaled per relation type
 */
function analyzeLinks($text, &$rc, &$concept, &$dict, $conf = 1) {
    wsfLog("*** analyzeLinks()",LL_DEBUG);

    $links= $this->wiki->extractLinks($text);
    # nothing to iterate over (same guard as in analyzeDisambiguation())
    if (!$links) return;

    foreach ( $links as $e ) {
        if (!$e) continue;

        # Read the fields explicitly instead of extract($e): extract() could
        # overwrite $conf/$dict/$concept/... if an entry carried such a key, and
        # a field missing from one entry would silently keep the value left
        # over from the previous iteration.
        $link= isset($e['link']) ? $e['link'] : NULL;
        $label= isset($e['label']) ? $e['label'] : NULL;
        $lang= isset($e['lang']) ? $e['lang'] : NULL;
        $namespace= isset($e['namespace']) ? $e['namespace'] : NULL;
        $escaped= isset($e['escaped']) ? $e['escaped'] : NULL;

        if ( $link === NULL || $link === '') {
            wsfLog( "skipping empty link!", LL_NOISY );
            continue;
        }

        if ( $link == $concept['name'] && !$lang) {
            wsfLog( "skipping self-link!", LL_NOISY );
            continue;
        }

        if ($lang) { #interwiki
            if (!$escaped) {
                $dict->putConceptToExternalConceptRel( 'similar_trans', $concept, $lang, $link, $rc, $conf * 0.6 );
            }
            else wsfLog("skipping escaped interlanguage-link to $lang: $link",LL_DEBUG);
        }
        else if ( $namespace ) {
            if (!$escaped) {
                if ( $namespace === "category" || $this->wiki->getNsIndex($namespace) === NS_CATEGORY ) {
                    if ($link!=$concept['name']) {
                        $dict->putConceptToConceptRel( 'cat', $concept, $link, $rc, $conf * 0.9 );
                    }

                    # a label containing a template call is not usable as a term
                    if ($label && preg_match('/\{\{.*\}\}/',$label)) $label= NULL;

                    if ($label && $label!=$concept['name'] && $label!=$link) {
                        if (strlen($label)>2) {
                            $norm= $this->normalizeTitle($label);
                            $label= $norm['normal'];
                            # drop one leading marker character and the blanks following it
                            $label= preg_replace('/^[ !\*\$%_\-] */','',$label);
                            if ($label!=='') $dict->putTermToConceptRel( 'meaning', $label, $concept, $rc, $conf * 0.6 );
                        }
                        else if ($label===' ' || $label==='!' || $label==='*' ||
                                 $label==='$' || $label==='%' || $label==='_' || $label==='-') {
                            # a bare marker character as label: relate concept and category as similar
                            $dict->putConceptToConceptRel( 'similar', $concept, $link, $rc, $conf * 0.4 );
                        }
                    }
                }
                else wsfLog("skipping link to namespace $namespace: $link",LL_DEBUG);
            }
            else wsfLog("skipping escaped link to namespace $namespace: $link",LL_DEBUG);
        }
        else {
            $this->putLink($label, $e, $rc, $concept, $dict, true, $conf);
        }
    }
}
/**
 * Stores the relations for one plain (non-interlanguage, non-namespace) link.
 *
 * The label defaults to $term when given, otherwise to the entry's own label,
 * otherwise to the link target; string labels are passed through
 * normalizeTitle(). For links with a section part, relations are stored both
 * for the page (at $conf * 0.2) and for the combined "page # section" concept
 * (at $conf * 0.9), and a section resource is registered for it.
 *
 * @param mixed  $term           label override (used for disambiguations and redirects); falsy to use the entry's label
 * @param array  $e              link entry; the fields section, link, label, lang and namespace are read
 * @param mixed  $rc             resource handle the relations are attributed to
 * @param array  $concept        source concept; $concept['term'] is used for page-internal section links
 * @param mixed  $dict           dictionary/store receiving the relations
 * @param bool   $putConceptLink whether to also store concept-to-concept 'links' relations
 * @param float  $conf           base confidence
 * @return bool true if relations were stored, false if the link was skipped
 */
function putLink($term, &$e, &$rc, &$concept, &$dict, $putConceptLink, $conf = 1) {
    if ($this->skipLink( $e )) {
        return false;
    }

    # import the entry's fields as locals; EXTR_SKIP keeps the parameters intact
    extract($e,EXTR_SKIP);

    if ($section && !$link) {
        $link= @$concept['term']; #internal section link
    }

    if ($term) {
        $label= $term; #overwrite label, for disambig and redirs.
    }
    else if ($label === NULL || $label==='') {
        $label= $link; #for plain links
    }

    if (is_string($label)) {
        $normalized= $this->normalizeTitle($label);
        $label= $normalized['normal'];
    }

    # only plain links with a target are recorded
    if ($lang || $namespace || !$link) {
        wsfLog( "skipping link: $lang:$namespace:$link!", LL_NOISY );
        return false;
    }

    $strong= $conf * 0.9;

    if (!$section) {
        if ($putConceptLink) {
            $dict->putConceptToConceptRel( 'links', $concept, $link, $rc, $strong );
        }
        $dict->putTermToConceptRel( 'meaning', $label, $link, $rc, $strong );
        return true;
    }

    # section link: weak relations to the page itself ...
    $weak= $conf * 0.2;
    if ($putConceptLink) {
        $dict->putConceptToConceptRel( 'links', $concept, $link, $rc, $weak );
    }
    $dict->putTermToConceptRel( 'meaning', $label, $link, $rc, $weak ); #THINK: omit?!

    # ... and strong relations to the combined page/section concept
    $full= $link.' # '.$section;
    if ($putConceptLink) {
        $dict->putConceptToConceptRel( 'links', $concept, $full, $rc, $strong );
    }
    $dict->putTermToConceptRel( 'meaning', $label, $full, $rc, $strong );
    $dict->putTermToConceptRel( 'meaning', $section, $full, $rc, $strong );
    $dict->putConceptToConceptRel( 'cat', $full, $link, $rc, $strong ); #TODO: only once
    $dict->putConceptToConceptRel( 'cat', $full, $section, $rc, $conf * 0.6 ); #TODO: only once

    $sectrc= $dict->putResource( $this->corpus, "$link#$section", false, RESOURCE_SECTION ); #FIXME: timestamp!!
    $dict->putResourceToConceptRel( 'seepage', $sectrc, $full );
    #FIXME: relate resources, update sections on resource-update!

    return true;
}
/**
 * Extracts term meanings from the list items of a disambiguation page.
 *
 * Every line that is a list item (optional ':' indentation followed by '#' or
 * '*') is scanned for links, and one link per line is chosen as the meaning
 * of $term, with a per-line confidence factor:
 *   0.9  the line consists of nothing but one link whose target contains the term
 *   0.7  the line consists of nothing but one link (target does not contain the term)
 *   0.8  some link on the line has a target containing the term
 *   0.4  fallback: first or last link of the line, per $this->disambigFirstLink
 * The chosen link is stored via putLink() with confidence $conf * factor.
 *
 * Fixes over the previous revision:
 *  - the search for a better link ran even when the line was a single bare
 *    link (the guard `!$l || $c` was always true), so 0.9 was always
 *    replaced by 0.8;
 *  - the per-line factor was computed but never handed to putLink();
 *  - an empty term is no longer used as a strpos() needle (PHP 8 reports a
 *    match at offset 0 for it, older versions emit a warning).
 *
 * @param string       $text    wiki text of the disambiguation page
 * @param mixed        $rc      resource handle the relations are attributed to
 * @param array        $concept concept record of the page
 * @param string|array $term    the ambiguous term, or an array carrying it under 'term'
 * @param mixed        $dict    dictionary/store receiving the relations
 * @param float        $conf    base confidence
 */
function analyzeDisambiguation( $text, &$rc, &$concept, &$term, &$dict, $conf = 1 ) {
    #FIXME: handle "this article..."
    wsfLog("*** analyzeDisambiguation()",LL_VERBOSE);

    if (is_array($term)) $trm= $this->wiki->lc($term['term']);
    else $trm= $this->wiki->lc($term);

    # only a non-empty term can be searched for in link targets
    $hasTrm= ($trm !== NULL && $trm !== false && $trm !== '');

    $t= str_replace("\r","\n",$text);
    $t= explode("\n", $t);

    foreach ( $t as $s ) {
        $s= trim($s);
        if (empty($s)) continue;

        # only list items are considered
        $m= array();
        if ( !preg_match('/^:*[#*]+ *(.+) *$/',$s,$m) ) continue;
        $s= $m[1];

        $links= $this->wiki->extractLinks($s);
        if ( !$links ) continue;

        $l= NULL; # chosen link entry
        $c= 0;    # confidence factor for the chosen link

        if (sizeof($links)===1 && preg_match('/^\[\[[^\]]+\]\]$/',$s)) {
            #just a link - that's it!
            $l= $links[0];
            if ($hasTrm && strpos($this->wiki->lc($l['link']),$trm)!==false) $c= 0.9;
            else $c= 0.7;
        }

        if (!$l) {
            #look for similar title first
            if ($hasTrm) {
                foreach ($links as $lnk) {
                    if (strpos($this->wiki->lc($lnk['link']),$trm)!==false) {
                        $l= $lnk;
                        $c= 0.8;
                        break;
                    }
                }
            }

            if (!$l) {
                if ($this->disambigFirstLink) $l= $links[0];
                else $l= $links[sizeof($links)-1];
                $c= 0.4;
            }
        }

        if ($l && $c) {
            $this->putLink($term, $l, $rc, $concept, $dict, false, $conf * $c);
        }
    }
}
/**
 * Analyzes one wiki page and writes everything learned from it to $dict.
 *
 * Steps: register the page as a resource and as a concept, relate the
 * normalized title to the concept as a 'meaning' term, relate the concept to
 * the title suffix as 'cat' when normalizeTitle() yields one, then dispatch
 * on the resource type:
 *  - RESOURCE_CONCEPT:  store the first sentence as definition, analyze links
 *  - RESOURCE_REDIRECT: relate the concept to the redirect target
 *  - RESOURCE_LIST:     analyze links
 *  - RESOURCE_DISAMBIG: analyze links and the disambiguation list
 *
 * Fixes over the previous revision: $conf is now forwarded to analyzeLinks()
 * for concept pages (it was dropped, so the default of 1 was used), and to
 * analyzeDisambiguation() (the call passed `$conf = 1`, an assignment that
 * reset the confidence to 1). The unused local $uri was removed.
 *
 * @param string|array $article page title (loaded via $this->wiki->loadInfo())
 *                              or a page record with the keys namespace,
 *                              title, text and timestamp
 * @param mixed        $dict    dictionary/store receiving all records
 * @param float        $conf    base confidence applied to every relation
 * @return bool|null false if the page was skipped (non-main namespace, empty
 *                   title or text); no explicit return value otherwise
 */
function analyze($article, &$dict, $conf = 1) {
    if (is_string($article)) $article= $this->wiki->loadInfo($article);

    if ($article['namespace']) {
        wsfLog( "skipping page in namespace {$article['namespace']}!", LL_VERBOSE );
        return false;
    }

    $title= $article['title'];
    $text= $article['text'];
    $timestamp= $article['timestamp'];

    if ($title===NULL || $title===false || $title==='') {
        wsfLog( "empty page title!", LL_WARN );
        return false;
    }

    if ($text===NULL || $text===false || $text==='') {
        wsfLog( "empty page: $title", LL_WARN );
        return false;
    }

    # the corpus record is created once and reused for all later pages
    if (!$this->corpus) {
        $this->corpus= $dict->putCorpus( $this->wiki->domain );
    }

    $rctype= $this->getResourceType($title, $text);
    $rc= $dict->putResource( $this->corpus, $title, $timestamp, $rctype );

    # only real concept pages get a concept type
    if ($rctype == RESOURCE_CONCEPT) {
        $contype= $this->getConceptType($title, $text);
    }
    else {
        $contype= CONCEPT_NONE;
    }

    # NOTE(review): failures below are only reported; processing continues.
    $concept= $dict->putConcept( $title, $contype);
    if (!$concept) trigger_error("failed to aquire concept: $title", E_USER_WARNING);

    $ok= $dict->putResourceToConceptRel( 'seepage', $rc, $concept );
    if ($ok===false) trigger_error("failed to put seepage relation for: $title", E_USER_WARNING);

    $a= $this->normalizeTitle($title);
    $cat= $a['cat'];
    $term= $a['normal'];

    # remembered on the concept record; putLink() reads it for page-internal section links
    $concept['term']= $term;

    #wsfLog("-- TITLE: $term -> $title",LL_DEBUG);
    $dict->putTermToConceptRel( 'meaning', $term, $concept, $rc, $conf * 0.9 );
    if ($cat) $dict->putConceptToConceptRel( 'cat', $concept, $cat, $rc, $conf * 0.9 );

    if ($rctype == RESOURCE_CONCEPT) {
        $def= $this->getFirstSentence($text);
        if ($def) {
            $def= $dict->putText( PART_DEFINITION, $def, $rc );
            $dict->putConceptToTextRel( 'definition', $concept, $def, $rc, $conf * 0.9 );
        }

        $this->analyzeLinks($text, $rc, $concept, $dict, $conf);
    }
    else if ($rctype == RESOURCE_REDIRECT) {
        $match= array();
        if ( preg_match( REDIRECT_PATTERN, $text, $match ) ) {
            $tgt= $match[1];
            $e= $this->wiki->makeLinkEntry($tgt);

            # only redirects to plain pages of this wiki are recorded
            if (!$e['lang'] && !$e['namespace'] && $e['link']) {
                $dict->putConceptToConceptRel( 'similar', $concept, $e['link'], $rc, $conf * 0.6 );
                $this->putLink($term, $e, $rc, $concept, $dict, false, $conf);
            }
            else wsfLog( "skipping namespace/language redirect to: {$e['link']}!", LL_NOISY );
        }
    }
    else if ($rctype == RESOURCE_LIST) {
        #TODO
        $this->analyzeLinks($text, $rc, $concept, $dict, $conf);
    }
    else if ($rctype == RESOURCE_DISAMBIG) {
        $this->analyzeLinks($text, $rc, $concept, $dict, $conf);
        $this->analyzeDisambiguation( $text, $rc, $concept, $term, $dict, $conf );
    }
    else {
        wsfLog( "skipping resource with type $rctype!", LL_INFO );
    }
}
/**
 * Splits a page title with $this->titleSuffixPattern.
 *
 * When the pattern matches, capture group 1 becomes the normalized title and
 * capture group 2 the 'cat' value; otherwise the title is returned unchanged
 * with a NULL 'cat'. The pattern is not defined in this class (presumably
 * inherited from WikiAnalyzer - TODO confirm).
 *
 * @param string $title page title or link label
 * @return array array('normal' => string, 'cat' => string|NULL)
 */
function normalizeTitle( $title ) {
    $result= array(
        'normal' => $title,
        'cat' => NULL,
    );

    $parts= array();
    if (preg_match($this->titleSuffixPattern,$title,$parts)) {
        $result['cat']= $parts[2];
        $result['normal']= $parts[1];
    }

    return $result;
}
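/* Commented-out alternative implementation of analyze() that builds and returns an array of relations instead of writing to $dict; it is not executed.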
function analyze($title, $text, &$wight) {
$rel= array();
$type= TYPE_CONCEPT;
$norm= $this->normalizeTitle($title);
if (preg_match($this->listTitlePattern,$title)) $type= TYPE_LIST;
if ($norm) {
$rel[]= array(
'from_language' => $this->language,
'from_term' => $normal,
'rel' => REL_NORMAL,
'to_language' => $this->language,
'to_term' => $title,
'weight' => 0.99 * $weight,
);
$rel[]= array(
'from_language' => $this->language,
'from_term' => $title,
'rel' => REL_CAT,
'to_language' => $this->language,
'to_term' => $norm['cat'],
'weight' => 0.99 * $weight,
);
}
#TODO: also create transcriptions!
$match= array();
if ( preg_match( REDIRECT_PATTERN, $text, $match ) ) {
$tgt= $match[1];
$e= $this->makeLinkEntry($tgt);
if (!$e['namespace'] && !$e['language']) {
$rel[]= array(
'from_language' => $this->language,
'from_term' => $title,
'rel' => REL_HYPERSYN,
'to_language' => $this->language,
'to_term' => $e['link'],
'weight' => 0.988 * $weight,
);
if ($e['section']) {
$rel[]= array(
'from_language' => $this->language,
'from_term' => $title,
'rel' => REL_HYPERSYN,
'to_language' => $this->language,
'to_term' => $e['section'],
'to_type' => TYPE_SECTION,
'weight' => 0.88 * $weight,
);
}
}
return array (
'size' => strlen($text),
'type' => TYPE_REDIRECT,
'relations' => $rel
);
}
$links= $this->extractLinks($text);
foreach ( $links as $e ) {
if (!$e) continue;
extract( $e );
if ( $link === NULL || $link === '') {
wsfLog( "skipping empty link!", LL_NOISY );
continue;
}
if ($this->skipLink( $link )) continue;
$lnkRel= REL_LINK; //all links...
$lnkLang= $this->language;
if ( $lang ) {
$lnkLang= $lang;
if (!$escaped) $lnkRel= REL_TRANS;
}
else if ( $namespace ) {
if ( $namespace == "category" || $this->getNsIndex($namespace) === NS_CATEGORY ) {
$lnkRel= REL_CAT;
//sort key as alias
if ($label && $label!=$title && strlen($label > 1)) {
$rel[]= array(
'from_language' => $this->language,
'from_term' => $label,
'rel' => REL_LABEL,
'to_language' => $this->language,
'to_term' => $title,
'weight' => 0.92 * $weight,
);
$label= NULL; //sort-keys are not labels!
}
}
else {
continue; //NOTE: don't process label either!
}
}
if ( $lnkRel ) {
$rel[]= array(
'from_language' => $this->language,
'from_term' => $title,
'rel' => $lnkRel,
'to_language' => $lnkLang,
'to_term' => $link,
'weight' => 0.99 * $weight,
);
}
#TODO: be smart...
#if ( $section && $lnkRel == REL_LINK) {
# $rel[]= array(
# 'from_language' => $this->language,
# 'from_term' => $title,
# 'rel' => REL_LINK,
# 'to_language' => $lang,
# 'to_term' => $section,
# 'to_type' => TYPE_SECTION,
# 'weight' => $weight,
# );
#}
if ( $label ) {
$rel[]= array(
'from_language' => $this->language,
'from_term' => $label,
'rel' => REL_LABEL,
'to_language' => $lnkLang,
'to_term' => $link,
'weight' => $weight * 0.8,
);
}
}
$templates= array();
preg_match_all( TEMPLATE_PATTERN, $text, $templates, PREG_PATTERN_ORDER );
$templates= $templates[1];
foreach ( $templates as $tmpl ) {
$e= $this->makeLinkEntry( $tmpl );
$rel[]= array(
'from_language' => $this->language,
'from_term' => $title,
'rel' => REL_TAG,
'to_language' => $this->language,
'to_term' => $e['link'], #NOTE: missing namespace?...
'weight' => 0.988 * $weight,
);
}
$disamb= $this->extractDisambiguationRelations($text);
if ( $disamb ) {
#print ">>> DISAMBIG: $title\n";
#print_r($disamb);
foreach ( $disamb as $s ) {
$rel[]= array(
'from_language' => $this->language,
'from_term' => $title,
'rel' => REL_HOM,
'to_language' => $this->language,
'to_term' => "$s",
'weight' => $weight * 0.2,
);
}
$type= TYPE_DISAMBIG;
}
if (preg_match($this->badPagePattern,$text)) $type= TYPE_BAD;
return array (
'size' => strlen($text),
'type' => $type,
'relations' => $rel
);
}
*/
}
?>
WikiAnalyzer_wikipedia.php
application/x-php, 19856 bytes (load raw)

