@ -36,7 +36,8 @@ import java.util.regex.Pattern;
import net.yacy.cora.storage.Files ;
import net.yacy.document.WordCache.Dictionary ;
import net.yacy.document.geolocalization.Locations ;
import net.yacy.document.geolocation.GeoLocation ;
import net.yacy.document.geolocation.Locations ;
public class Tagging {
@ -52,6 +53,39 @@ public class Tagging {
private String predicate , namespace , objectspace ;
/ * *
* helper class : Synonym and Objectlink tuple
* /
public static class SOTuple {
private final String synonyms ;
private final String objectlink ;
public SOTuple ( String synonyms , String objectlink ) {
this . synonyms = synonyms ;
this . objectlink = objectlink ;
}
public SOTuple ( String [ ] synonyms , String objectlink ) {
StringBuilder sb = new StringBuilder ( synonyms . length * 10 ) ;
for ( String s : synonyms ) sb . append ( ',' ) . append ( s ) ;
this . synonyms = sb . substring ( 1 ) ;
this . objectlink = objectlink ;
}
public String getSynonymsCSV ( ) {
return this . synonyms ;
}
public String [ ] getSynonymsList ( ) {
return this . synonyms . split ( "," ) ;
}
public String getObjectlink ( ) {
return this . objectlink ;
}
}
public Tagging ( String name ) {
this . navigatorName = name ;
this . synonym2term = new ConcurrentHashMap < String , String > ( ) ;
@ -95,7 +129,7 @@ public class Tagging {
vocloop : for ( Map . Entry < String , SOTuple > e : table . entrySet ( ) ) {
if ( e . getValue ( ) . getSynonymsCSV ( ) = = null | | e . getValue ( ) . getSynonymsCSV ( ) . length ( ) = = 0 ) {
term = normalizeKey ( e . getKey ( ) ) ;
v = normalize Word ( e . getKey ( ) ) ;
v = normalize Term ( e . getKey ( ) ) ;
this . synonym2term . put ( v , term ) ;
this . term2synonym . put ( term , v ) ;
if ( e . getValue ( ) . getObjectlink ( ) ! = null & & e . getValue ( ) . getObjectlink ( ) . length ( ) > 0 ) this . term2objectlink . put ( term , e . getValue ( ) . getObjectlink ( ) ) ;
@ -108,13 +142,13 @@ public class Tagging {
tagloop : for ( String synonym : tags ) {
if ( synonym . length ( ) = = 0 ) continue tagloop ;
synonyms . add ( synonym ) ;
synonym = normalize Word ( synonym ) ;
synonym = normalize Term ( synonym ) ;
if ( synonym . length ( ) = = 0 ) continue tagloop ;
synonyms . add ( synonym ) ;
this . synonym2term . put ( synonym , term ) ;
this . term2synonym . put ( term , synonym ) ;
}
String synonym = normalize Word ( term ) ;
String synonym = normalize Term ( term ) ;
this . synonym2term . put ( synonym , term ) ;
this . term2synonym . put ( term , synonym ) ;
if ( e . getValue ( ) . getObjectlink ( ) ! = null & & e . getValue ( ) . getObjectlink ( ) . length ( ) > 0 ) this . term2objectlink . put ( term , e . getValue ( ) . getObjectlink ( ) ) ;
@ -129,49 +163,115 @@ public class Tagging {
for ( Map . Entry < String , SOTuple > e : table . entrySet ( ) ) {
String s = e . getValue ( ) = = null ? "" : e . getValue ( ) . getSynonymsCSV ( ) ;
String o = e . getValue ( ) = = null ? "" : e . getValue ( ) . getObjectlink ( ) ;
w . write ( e . getKey ( ) + ( s = = null | | s . length ( ) = = 0 ? "" : ":" + e . getValue ( ) ) + ( o = = null | | o . length ( ) = = 0 | | o . equals ( objectspace + e . getKey ( ) ) ? "" : "#" + o ) + "\n" ) ;
w . write ( e . getKey ( ) + ( s = = null | | s . length ( ) = = 0 ? "" : ":" + e . getValue ( ) .getSynonymsCSV ( ) ) + ( o = = null | | o . length ( ) = = 0 | | o . equals ( objectspace + e . getKey ( ) ) ? "" : "#" + o ) + "\n" ) ;
}
w . close ( ) ;
init ( ) ;
}
}
/ * *
* helper class : Synonym and Objectlink tuple
* /
public static class SOTuple {
private final String synonyms ;
private final String objectlink ;
public SOTuple ( String synonyms , String objectlink ) {
this . synonyms = synonyms ;
this . objectlink = objectlink ;
}
public SOTuple ( String [ ] synonyms , String objectlink ) {
StringBuilder sb = new StringBuilder ( synonyms . length * 10 ) ;
for ( String s : synonyms ) sb . append ( ',' ) . append ( s ) ;
this . synonyms = sb . substring ( 1 ) ;
this . objectlink = objectlink ;
public Tagging ( String name , Locations location ) {
this ( name ) ;
Set < String > locNames = location . locationNames ( ) ;
TreeSet < GeoLocation > geo ;
GeoLocation g ;
for ( String loc : locNames ) {
String syn = normalizeTerm ( loc ) ;
this . synonym2term . put ( syn , loc ) ;
this . term2synonym . put ( loc , syn ) ;
geo = location . find ( loc , true ) ;
if ( geo . size ( ) > 0 ) {
g = geo . iterator ( ) . next ( ) ;
this . term2objectlink . put ( loc , "http://www.openstreetmap.org/?lat=" + g . lat ( ) + "&lon=" + g . lon ( ) + "&zoom=16" ) ;
}
}
}
public String getSynonymsCSV ( ) {
return this . synonyms ;
public Tagging ( String name , Dictionary dictionary ) {
this ( name ) ;
Set < StringBuilder > words = dictionary . getWords ( ) ;
String s ;
for ( StringBuilder word : words ) {
s = word . toString ( ) ;
this . synonym2term . put ( s . toLowerCase ( ) , s ) ;
this . term2synonym . put ( s , s . toLowerCase ( ) ) ;
}
}
public String [ ] getSynonymsList ( ) {
return this . synonyms . split ( "," ) ;
}
public void init ( ) throws IOException {
if ( this . propFile = = null ) return ;
this . synonym2term . clear ( ) ;
this . term2synonym . clear ( ) ;
this . term2objectlink . clear ( ) ;
this . synonym2synonyms . clear ( ) ;
this . namespace = DEFAULT_NAMESPACE ;
this . predicate = this . namespace + this . navigatorName ;
this . objectspace = null ;
public String getObjectlink ( ) {
return this . objectlink ;
BlockingQueue < String > list = Files . concurentLineReader ( this . propFile , 1000 ) ;
String term , v ;
String [ ] tags ;
int p ;
String line ;
try {
vocloop : while ( ( line = list . take ( ) ) ! = Files . POISON_LINE ) {
line = line . trim ( ) ;
p = line . indexOf ( '#' ) ;
if ( p > = 0 ) {
String comment = line . substring ( p + 1 ) . trim ( ) ;
if ( comment . startsWith ( "namespace:" ) ) {
this . namespace = comment . substring ( 10 ) . trim ( ) ;
if ( ! this . namespace . endsWith ( "/" ) & & ! this . namespace . endsWith ( "#" ) & & this . namespace . length ( ) > 0 ) this . namespace + = "#" ;
this . predicate = this . namespace + this . navigatorName ;
continue vocloop ;
}
if ( comment . startsWith ( "objectspace:" ) ) {
this . objectspace = comment . substring ( 12 ) . trim ( ) ;
if ( ! this . objectspace . endsWith ( "/" ) & & ! this . objectspace . endsWith ( "#" ) & & this . objectspace . length ( ) > 0 ) this . objectspace + = "#" ;
continue vocloop ;
}
}
String [ ] pl = parseLine ( line ) ;
if ( pl = = null ) {
continue vocloop ;
}
if ( pl [ 1 ] = = null ) {
term = normalizeKey ( pl [ 0 ] ) ;
v = normalizeTerm ( pl [ 0 ] ) ;
this . synonym2term . put ( v , term ) ;
this . term2synonym . put ( term , v ) ;
if ( pl [ 2 ] ! = null & & pl [ 2 ] . length ( ) > 0 ) this . term2objectlink . put ( term , pl [ 2 ] ) ;
continue vocloop ;
}
term = normalizeKey ( pl [ 0 ] ) ;
v = pl [ 1 ] ;
tags = v . split ( "," ) ;
Set < String > synonyms = new HashSet < String > ( ) ;
synonyms . add ( term ) ;
tagloop : for ( String synonym : tags ) {
if ( synonym . length ( ) = = 0 ) continue tagloop ;
synonyms . add ( synonym ) ;
synonym = normalizeTerm ( synonym ) ;
if ( synonym . length ( ) = = 0 ) continue tagloop ;
synonyms . add ( synonym ) ;
this . synonym2term . put ( synonym , term ) ;
this . term2synonym . put ( term , synonym ) ;
}
String synonym = normalizeTerm ( term ) ;
this . synonym2term . put ( synonym , term ) ;
this . term2synonym . put ( term , synonym ) ;
if ( pl [ 2 ] ! = null & & pl [ 2 ] . length ( ) > 0 ) this . term2objectlink . put ( term , pl [ 2 ] ) ;
synonyms . add ( synonym ) ;
for ( String s : synonyms ) {
this . synonym2synonyms . put ( s , synonyms ) ;
}
}
} catch ( InterruptedException e ) {
}
}
public void updateTerm ( String term , String [ ] synonyms ) {
public int size ( ) {
return this . term2objectlink . size ( ) ;
}
private File tmpFile ( ) {
@ -363,98 +463,6 @@ public class Tagging {
return new String [ ] { line . substring ( 0 , p ) , line . substring ( p + 1 ) , c } ;
}
public void init ( ) throws IOException {
if ( this . propFile = = null ) return ;
this . synonym2term . clear ( ) ;
this . term2synonym . clear ( ) ;
this . term2objectlink . clear ( ) ;
this . synonym2synonyms . clear ( ) ;
this . namespace = DEFAULT_NAMESPACE ;
this . predicate = this . namespace + this . navigatorName ;
this . objectspace = null ;
BlockingQueue < String > list = Files . concurentLineReader ( this . propFile , 1000 ) ;
String term , v ;
String [ ] tags ;
int p ;
String line ;
try {
vocloop : while ( ( line = list . take ( ) ) ! = Files . POISON_LINE ) {
line = line . trim ( ) ;
p = line . indexOf ( '#' ) ;
if ( p > = 0 ) {
String comment = line . substring ( p + 1 ) . trim ( ) ;
if ( comment . startsWith ( "namespace:" ) ) {
this . namespace = comment . substring ( 10 ) . trim ( ) ;
if ( ! this . namespace . endsWith ( "/" ) & & ! this . namespace . endsWith ( "#" ) & & this . namespace . length ( ) > 0 ) this . namespace + = "#" ;
this . predicate = this . namespace + this . navigatorName ;
continue vocloop ;
}
if ( comment . startsWith ( "objectspace:" ) ) {
this . objectspace = comment . substring ( 12 ) . trim ( ) ;
if ( ! this . objectspace . endsWith ( "/" ) & & ! this . objectspace . endsWith ( "#" ) & & this . objectspace . length ( ) > 0 ) this . objectspace + = "#" ;
continue vocloop ;
}
}
String [ ] pl = parseLine ( line ) ;
if ( pl = = null ) {
continue vocloop ;
}
if ( pl [ 1 ] = = null ) {
term = normalizeKey ( pl [ 0 ] ) ;
v = normalizeWord ( pl [ 0 ] ) ;
this . synonym2term . put ( v , term ) ;
this . term2synonym . put ( term , v ) ;
if ( pl [ 2 ] ! = null & & pl [ 2 ] . length ( ) > 0 ) this . term2objectlink . put ( term , pl [ 2 ] ) ;
continue vocloop ;
}
term = normalizeKey ( pl [ 0 ] ) ;
v = pl [ 1 ] ;
tags = v . split ( "," ) ;
Set < String > synonyms = new HashSet < String > ( ) ;
synonyms . add ( term ) ;
tagloop : for ( String synonym : tags ) {
if ( synonym . length ( ) = = 0 ) continue tagloop ;
synonyms . add ( synonym ) ;
synonym = normalizeWord ( synonym ) ;
if ( synonym . length ( ) = = 0 ) continue tagloop ;
synonyms . add ( synonym ) ;
this . synonym2term . put ( synonym , term ) ;
this . term2synonym . put ( term , synonym ) ;
}
String synonym = normalizeWord ( term ) ;
this . synonym2term . put ( synonym , term ) ;
this . term2synonym . put ( term , synonym ) ;
if ( pl [ 2 ] ! = null & & pl [ 2 ] . length ( ) > 0 ) this . term2objectlink . put ( term , pl [ 2 ] ) ;
synonyms . add ( synonym ) ;
for ( String s : synonyms ) {
this . synonym2synonyms . put ( s , synonyms ) ;
}
}
} catch ( InterruptedException e ) {
}
}
public Tagging ( String name , Locations localization ) {
this ( name ) ;
Set < String > locNames = localization . locationNames ( ) ;
for ( String loc : locNames ) {
this . synonym2term . put ( loc . toLowerCase ( ) , loc ) ;
this . term2synonym . put ( loc , loc . toLowerCase ( ) ) ;
}
}
public Tagging ( String name , Dictionary dictionary ) {
this ( name ) ;
Set < StringBuilder > words = dictionary . getWords ( ) ;
String s ;
for ( StringBuilder word : words ) {
s = word . toString ( ) ;
this . synonym2term . put ( s . toLowerCase ( ) , s ) ;
this . term2synonym . put ( s , s . toLowerCase ( ) ) ;
}
}
/ * *
* get the predicate name which already contains the prefix url stub
* @return
@ -493,14 +501,14 @@ public class Tagging {
return this . propFile ;
}
public Metatag getMetatagFromSynonym ( char prefix , final String word ) {
public Metatag getMetatagFromSynonym ( final String word ) {
String printname = this . synonym2term . get ( word ) ;
if ( printname = = null ) return null ;
return new Metatag ( pr efix, pr intname) ;
return new Metatag ( pr intname) ;
}
public Metatag getMetatagFromTerm ( char prefix , final String word ) {
return new Metatag ( prefix, word) ;
public Metatag getMetatagFromTerm ( final String word ) {
return new Metatag ( word) ;
}
public Set < String > getSynonyms ( String term ) {
@ -532,20 +540,23 @@ public class Tagging {
private final static Pattern PATTERN_UE = Pattern . compile ( "\u00FC" ) ;
private final static Pattern PATTERN_SZ = Pattern . compile ( "\u00DF" ) ;
public static final String normalizeWord ( String word ) {
word = word . trim ( ) . toLowerCase ( ) ;
word = PATTERN_AE . matcher ( word ) . replaceAll ( "ae" ) ;
word = PATTERN_OE . matcher ( word ) . replaceAll ( "oe" ) ;
word = PATTERN_UE . matcher ( word ) . replaceAll ( "ue" ) ;
word = PATTERN_SZ . matcher ( word ) . replaceAll ( "ss" ) ;
return word ;
public static final String normalizeTerm ( String term ) {
term = term . trim ( ) . toLowerCase ( ) ;
term = PATTERN_AE . matcher ( term ) . replaceAll ( "ae" ) ;
term = PATTERN_OE . matcher ( term ) . replaceAll ( "oe" ) ;
term = PATTERN_UE . matcher ( term ) . replaceAll ( "ue" ) ;
term = PATTERN_SZ . matcher ( term ) . replaceAll ( "ss" ) ;
// remove comma
int p ;
while ( ( p = term . indexOf ( ',' ) ) > = 0 ) {
term = term . substring ( p + 1 ) . trim ( ) + " " + term . substring ( 0 , p ) ;
}
return term ;
}
public class Metatag {
private final String object ;
private final char prefix ;
public Metatag ( char prefix , String object ) {
this . prefix = prefix ;
public Metatag ( String object ) {
this . object = object ;
}
@ -563,7 +574,7 @@ public class Tagging {
@Override
public String toString ( ) {
return this . prefix + Tagging . this . navigatorName + ":" + encodePrintname ( this . object ) ;
return Tagging . this . navigatorName + ":" + encodePrintname ( this . object ) ;
}
@Override
@ -589,12 +600,12 @@ public class Tagging {
return PATTERN_UL . matcher ( maskname ) . replaceAll ( " " ) ;
}
public static String cleanTagFromAutotagging ( char prefix , final String tagString ) {
public static String cleanTagFromAutotagging ( final String tagString ) {
if ( tagString = = null | | tagString . length ( ) = = 0 ) return "" ;
String [ ] tags = PATTERN_SP . split ( tagString ) ;
StringBuilder sb = new StringBuilder ( tagString . length ( ) ) ;
for ( String tag : tags ) {
if ( tag . length ( ) > 0 & & tag . charAt ( 0 ) ! = prefix ) {
if ( tag . length ( ) > 0 ) {
sb . append ( tag ) . append ( ' ' ) ;
}
}