@ -58,15 +58,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
/** Degree sign that introduces the minutes part of a coordinate such as "N 50° 05.453'". */
private final char degree = '\u00B0';

/**
 * HTML-escaped minute mark searched for first when locating the end of a coordinate.
 * scrapeText falls back to the plain apostrophe afterwards, so this must be the
 * escaped form to add anything to that fallback.
 * NOTE(review): the numeric entity spelling is assumed - confirm against the pages
 * this scraper is expected to handle.
 */
private final char[] minuteCharsHTML = "&#039;".toCharArray();

// statics: for initialization of the HTMLFilterAbstractScraper
// linkTags0 receives the names of all Tag constants of type singleton,
// linkTags1 those of type pair (both are filled by the static initializer).
private static final Set<String> linkTags0 = new HashSet<String>(9, 0.99f);
private static final Set<String> linkTags1 = new HashSet<String>(7, 0.99f);
/**
 * Scraping mode of a tag. The static initializer sorts the names of all
 * {@code Tag} constants into one of two sets depending on this value.
 */
public enum TagType {
    /** tag whose name is registered in the singleton tag set */
    singleton,
    /** tag whose name is registered in the pair tag set */
    pair
}
public enum Tag {
html ( TagType . singleton ) , // scraped as singleton to get attached properties like 'lang'
body ( TagType . singleton ) , // scraped as singleton to get attached properties like 'class'
@ -96,14 +96,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
script ( TagType . pair ) ;
public TagType type ;
private Tag ( TagType type ) {
private Tag ( final TagType type ) {
this . type = type ;
}
}
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
static {
for ( Tag tag : Tag . values ( ) ) {
for ( final Tag tag : Tag . values ( ) ) {
if ( tag . type = = TagType . singleton ) linkTags0 . add ( tag . name ( ) ) ;
if ( tag . type = = TagType . pair ) linkTags1 . add ( tag . name ( ) ) ;
}
@ -112,33 +112,33 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// class variables: collectors for links
// Each field is declared exactly once; collectors that are never re-assigned are final.
// anchors, images, title, headlines, content and root stay non-final because close() nulls them.
private Map<MultiProtocolURI, Properties> anchors;
private final Map<MultiProtocolURI, String> rss, css;
private final Set<MultiProtocolURI> script, frames, iframes;
private Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final Map<String, String> metas;
private String title;
//private String headline;
private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic;
private final List<String> li;
private CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
// 0.0f doubles as "not determined yet": getLon()/getLat() re-parse while the value is 0.0f
private float lon, lat;

/**
 * {@link MultiProtocolURI} to the favicon that belongs to the document
 */
private MultiProtocolURI favicon;

/**
 * The document root {@link MultiProtocolURI}
 */
private MultiProtocolURI root;

/**
 * evaluation scores: count appearance of specific attributes
 */
private final Evaluation evaluationScores;
@SuppressWarnings ( "unchecked" )
public ContentScraper ( final MultiProtocolURI root ) {
@ -157,7 +157,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this . script = new HashSet < MultiProtocolURI > ( ) ;
this . title = "" ;
this . headlines = new ArrayList [ 6 ] ;
for ( int i = 0 ; i < this . headlines . length ; i + + ) headlines[ i ] = new ArrayList < String > ( ) ;
for ( int i = 0 ; i < this . headlines . length ; i + + ) this . headlines[ i ] = new ArrayList < String > ( ) ;
this . bold = new ClusteredScoreMap < String > ( ) ;
this . italic = new ClusteredScoreMap < String > ( ) ;
this . li = new ArrayList < String > ( ) ;
@ -167,28 +167,28 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this . lat = 0.0f ;
this . evaluationScores . match ( Element . url , root . toNormalform ( false , false ) ) ;
}
public void scrapeText ( final char [ ] newtext , final String insideTag ) {
// System.out.println("SCRAPE: " + UTF8.String(newtext));
int p , pl , q , s = 0 ;
// match evaluation pattern
this . evaluationScores . match ( Element . text , newtext ) ;
// try to find location information in text
// Opencaching:
// <nobr>N 50o 05.453'</nobr><nobr>E 008o 30.191'</nobr>
// N 52o 28.025 E 013o 20.299
location : while ( s < newtext . length ) {
pl = 1 ;
p = CharBuffer . indexOf ( newtext , s , degree) ;
p = CharBuffer . indexOf ( newtext , s , this . degree) ;
if ( p < 0 ) { p = CharBuffer . indexOf ( newtext , s , "°" . toCharArray ( ) ) ; if ( p > = 0 ) pl = 5 ; }
if ( p < 0 ) break location ;
q = CharBuffer . indexOf ( newtext , p + pl , minuteCharsHTML) ;
q = CharBuffer . indexOf ( newtext , p + pl , this . minuteCharsHTML) ;
if ( q < 0 ) q = CharBuffer . indexOf ( newtext , p + pl , "'" . toCharArray ( ) ) ;
if ( q < 0 ) q = CharBuffer . indexOf ( newtext , p + pl , " E" . toCharArray ( ) ) ;
if ( q < 0 ) q = CharBuffer . indexOf ( newtext , p + pl , " W" . toCharArray ( ) ) ;
if ( q < 0 & & newtext . length - p = = 7 + pl ) q = newtext . length ;
if ( q < 0 & & newtext . length - p = = 7 + pl ) q = newtext . length ;
if ( q < 0 ) break location ;
int r = p ;
while ( r - - > 1 ) {
@ -254,22 +254,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
s = p + 6 ;
try {
url = new MultiProtocolURI ( u ) ;
anchors. put ( url , new Properties ( ) ) ;
this . anchors. put ( url , new Properties ( ) ) ;
continue ;
} catch ( MalformedURLException e ) { }
} catch ( final MalformedURLException e ) { }
}
// append string to content
if ( b . length ( ) ! = 0 ) content. append ( b ) . append ( 32 ) ;
if ( b . length ( ) ! = 0 ) this . content. append ( b ) . append ( 32 ) ;
}
/**
 * Searches {@code s} for {@code m}, beginning at index {@code start}.
 *
 * @param s     the text to search in
 * @param m     the text to search for
 * @param start index at which the search begins
 * @return the index of the first match, or {@link Integer#MAX_VALUE} if there is none
 */
private static final int find(final String s, final String m, final int start) {
    final int hit = s.indexOf(m, start);
    if (hit >= 0) {
        return hit;
    }
    // "not found" sorts after every real position
    return Integer.MAX_VALUE;
}
private MultiProtocolURI absolutePath ( final String relativePath ) {
try {
return MultiProtocolURI . newURL ( root, relativePath ) ;
return MultiProtocolURI . newURL ( this . root, relativePath ) ;
} catch ( final Exception e ) {
return null ;
}
@ -277,7 +277,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeTag0 ( final String tagname , final Properties tagopts ) {
if ( tagname . equalsIgnoreCase ( "img" ) ) {
String src = tagopts . getProperty ( "src" , "" ) ;
final String src = tagopts . getProperty ( "src" , "" ) ;
try {
final int width = Integer . parseInt ( tagopts . getProperty ( "width" , "-1" ) ) ;
final int height = Integer . parseInt ( tagopts . getProperty ( "height" , "-1" ) ) ;
@ -285,48 +285,48 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final MultiProtocolURI url = absolutePath ( src ) ;
if ( url ! = null ) {
final ImageEntry ie = new ImageEntry ( url , tagopts . getProperty ( "alt" , "" ) , width , height , - 1 ) ;
addImage ( images, ie ) ;
addImage ( this . images, ie ) ;
}
}
} catch ( final NumberFormatException e ) { }
this . evaluationScores . match ( Element . imgpath , src ) ;
} else if ( tagname . equalsIgnoreCase ( "base" ) ) {
try {
root = new MultiProtocolURI ( tagopts . getProperty ( "href" , "" ) ) ;
this . root = new MultiProtocolURI ( tagopts . getProperty ( "href" , "" ) ) ;
} catch ( final MalformedURLException e ) { }
} else if ( tagname . equalsIgnoreCase ( "frame" ) ) {
MultiProtocolURI src = absolutePath ( tagopts . getProperty ( "src" , "" ) ) ;
anchors. put ( src , tagopts /* with property "name" */ ) ;
frames. add ( src ) ;
final MultiProtocolURI src = absolutePath ( tagopts . getProperty ( "src" , "" ) ) ;
this . anchors. put ( src , tagopts /* with property "name" */ ) ;
this . frames. add ( src ) ;
this . evaluationScores . match ( Element . framepath , src . toNormalform ( true , false ) ) ;
} else if ( tagname . equalsIgnoreCase ( "body" ) ) {
String c = tagopts . getProperty ( "class" , "" ) ;
final String c = tagopts . getProperty ( "class" , "" ) ;
this . evaluationScores . match ( Element . bodyclass , c ) ;
} else if ( tagname . equalsIgnoreCase ( "div" ) ) {
String id = tagopts . getProperty ( "id" , "" ) ;
final String id = tagopts . getProperty ( "id" , "" ) ;
this . evaluationScores . match ( Element . divid , id ) ;
} else if ( tagname . equalsIgnoreCase ( "meta" ) ) {
String name = tagopts . getProperty ( "name" , "" ) ;
String content = tagopts . getProperty ( "content" , "" ) ;
final String content = tagopts . getProperty ( "content" , "" ) ;
if ( name . length ( ) > 0 ) {
metas. put ( name . toLowerCase ( ) , CharacterCoding . html2unicode ( content ) ) ;
this . metas. put ( name . toLowerCase ( ) , CharacterCoding . html2unicode ( content ) ) ;
if ( name . equals ( "generator" ) ) {
this . evaluationScores . match ( Element . metagenerator , content ) ;
}
} else {
name = tagopts . getProperty ( "http-equiv" , "" ) ;
if ( name . length ( ) > 0 ) {
metas. put ( name . toLowerCase ( ) , CharacterCoding . html2unicode ( content ) ) ;
this . metas. put ( name . toLowerCase ( ) , CharacterCoding . html2unicode ( content ) ) ;
}
}
} else if ( tagname . equalsIgnoreCase ( "area" ) ) {
final String areatitle = cleanLine ( tagopts . getProperty ( "title" , "" ) ) ;
//String alt = tagopts.getProperty("alt","");
final String href = tagopts . getProperty ( "href" , "" ) ;
Properties p = new Properties ( ) ; p . put ( "name" , areatitle ) ;
if ( href . length ( ) > 0 ) anchors. put ( absolutePath ( href ) , p ) ;
final Properties p = new Properties ( ) ; p . put ( "name" , areatitle ) ;
if ( href . length ( ) > 0 ) this . anchors. put ( absolutePath ( href ) , p ) ;
} else if ( tagname . equalsIgnoreCase ( "link" ) ) {
String href = tagopts . getProperty ( "href" , "" ) ;
final String href = tagopts . getProperty ( "href" , "" ) ;
final MultiProtocolURI newLink = absolutePath ( href ) ;
if ( newLink ! = null ) {
@ -336,31 +336,31 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if ( rel . equalsIgnoreCase ( "shortcut icon" ) ) {
final ImageEntry ie = new ImageEntry ( newLink , linktitle , - 1 , - 1 , - 1 ) ;
images. put ( ie . url ( ) , ie ) ;
this . images. put ( ie . url ( ) , ie ) ;
this . favicon = newLink ;
} else if ( rel . equalsIgnoreCase ( "alternate" ) & & type . equalsIgnoreCase ( "application/rss+xml" ) ) {
rss. put ( newLink , linktitle ) ;
this . rss. put ( newLink , linktitle ) ;
} else if ( rel . equalsIgnoreCase ( "stylesheet" ) & & type . equalsIgnoreCase ( "text/css" ) ) {
css. put ( newLink , rel ) ;
this . css. put ( newLink , rel ) ;
this . evaluationScores . match ( Element . csspath , href ) ;
} else if ( ! rel . equalsIgnoreCase ( "stylesheet" ) & & ! rel . equalsIgnoreCase ( "alternate stylesheet" ) ) {
Properties p = new Properties ( ) ; p . put ( "name" , linktitle ) ;
anchors. put ( newLink , p ) ;
final Properties p = new Properties ( ) ; p . put ( "name" , linktitle ) ;
this . anchors. put ( newLink , p ) ;
}
}
} else if ( tagname . equalsIgnoreCase ( "embed" ) ) {
anchors. put ( absolutePath ( tagopts . getProperty ( "src" , "" ) ) , tagopts /* with property "name" */ ) ;
this . anchors. put ( absolutePath ( tagopts . getProperty ( "src" , "" ) ) , tagopts /* with property "name" */ ) ;
} else if ( tagname . equalsIgnoreCase ( "param" ) ) {
final String name = tagopts . getProperty ( "name" , "" ) ;
if ( name . equalsIgnoreCase ( "movie" ) ) {
anchors. put ( absolutePath ( tagopts . getProperty ( "value" , "" ) ) , tagopts /* with property "name" */ ) ;
this . anchors. put ( absolutePath ( tagopts . getProperty ( "value" , "" ) ) , tagopts /* with property "name" */ ) ;
}
}
// fire event
fireScrapeTag0 ( tagname , tagopts ) ;
}
public void scrapeTag1 ( final String tagname , final Properties tagopts , final char [ ] text ) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
if ( tagname . equalsIgnoreCase ( "a" ) & & text . length < 2048 ) {
@ -373,10 +373,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if ( type . equals ( "png" ) | | type . equals ( "gif" ) | | type . equals ( "jpg" ) | | type . equals ( "jpeg" ) | | type . equals ( "tiff" ) | | type . equals ( "tif" ) ) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry ( url , recursiveParse ( text ) , - 1 , - 1 , - 1 ) ;
addImage ( images, ie ) ;
addImage ( this . images, ie ) ;
} else {
tagopts . put ( "name" , recursiveParse ( text ) ) ;
anchors. put ( url , tagopts ) ;
this . anchors. put ( url , tagopts ) ;
}
}
this . evaluationScores . match ( Element . apath , href ) ;
@ -384,45 +384,45 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String h ;
if ( ( tagname . equalsIgnoreCase ( "h1" ) ) & & ( text . length < 1024 ) ) {
h = recursiveParse ( text ) ;
if ( h . length ( ) > 0 ) headlines[ 0 ] . add ( h ) ;
if ( h . length ( ) > 0 ) this . headlines[ 0 ] . add ( h ) ;
} else if ( ( tagname . equalsIgnoreCase ( "h2" ) ) & & ( text . length < 1024 ) ) {
h = recursiveParse ( text ) ;
if ( h . length ( ) > 0 ) headlines[ 1 ] . add ( h ) ;
if ( h . length ( ) > 0 ) this . headlines[ 1 ] . add ( h ) ;
} else if ( ( tagname . equalsIgnoreCase ( "h3" ) ) & & ( text . length < 1024 ) ) {
h = recursiveParse ( text ) ;
if ( h . length ( ) > 0 ) headlines[ 2 ] . add ( h ) ;
if ( h . length ( ) > 0 ) this . headlines[ 2 ] . add ( h ) ;
} else if ( ( tagname . equalsIgnoreCase ( "h4" ) ) & & ( text . length < 1024 ) ) {
h = recursiveParse ( text ) ;
if ( h . length ( ) > 0 ) headlines[ 3 ] . add ( h ) ;
if ( h . length ( ) > 0 ) this . headlines[ 3 ] . add ( h ) ;
} else if ( ( tagname . equalsIgnoreCase ( "h5" ) ) & & ( text . length < 1024 ) ) {
h = recursiveParse ( text ) ;
if ( h . length ( ) > 0 ) headlines[ 4 ] . add ( h ) ;
if ( h . length ( ) > 0 ) this . headlines[ 4 ] . add ( h ) ;
} else if ( ( tagname . equalsIgnoreCase ( "h6" ) ) & & ( text . length < 1024 ) ) {
h = recursiveParse ( text ) ;
if ( h . length ( ) > 0 ) headlines[ 5 ] . add ( h ) ;
if ( h . length ( ) > 0 ) this . headlines[ 5 ] . add ( h ) ;
} else if ( ( tagname . equalsIgnoreCase ( "title" ) ) & & ( text . length < 1024 ) ) {
title = recursiveParse ( text ) ;
this . title = recursiveParse ( text ) ;
} else if ( ( tagname . equalsIgnoreCase ( "b" ) ) & & ( text . length < 1024 ) ) {
h = recursiveParse ( text ) ;
if ( h . length ( ) > 0 ) bold. inc ( h ) ;
if ( h . length ( ) > 0 ) this . bold. inc ( h ) ;
} else if ( ( tagname . equalsIgnoreCase ( "strong" ) ) & & ( text . length < 1024 ) ) {
h = recursiveParse ( text ) ;
if ( h . length ( ) > 0 ) bold. inc ( h ) ;
if ( h . length ( ) > 0 ) this . bold. inc ( h ) ;
} else if ( ( tagname . equalsIgnoreCase ( "i" ) ) & & ( text . length < 1024 ) ) {
h = recursiveParse ( text ) ;
if ( h . length ( ) > 0 ) italic. inc ( h ) ;
if ( h . length ( ) > 0 ) this . italic. inc ( h ) ;
} else if ( ( tagname . equalsIgnoreCase ( "li" ) ) & & ( text . length < 1024 ) ) {
h = recursiveParse ( text ) ;
if ( h . length ( ) > 0 ) li. add ( h ) ;
if ( h . length ( ) > 0 ) this . li. add ( h ) ;
} else if ( tagname . equalsIgnoreCase ( "iframe" ) ) {
MultiProtocolURI src = absolutePath ( tagopts . getProperty ( "src" , "" ) ) ;
anchors. put ( src , tagopts /* with property "name" */ ) ;
iframes. add ( src ) ;
final MultiProtocolURI src = absolutePath ( tagopts . getProperty ( "src" , "" ) ) ;
this . anchors. put ( src , tagopts /* with property "name" */ ) ;
this . iframes. add ( src ) ;
this . evaluationScores . match ( Element . iframepath , src . toNormalform ( true , false ) ) ;
} else if ( tagname . equalsIgnoreCase ( "script" ) ) {
String src = tagopts . getProperty ( "src" , "" ) ;
final String src = tagopts . getProperty ( "src" , "" ) ;
if ( src . length ( ) > 0 ) {
script. add ( absolutePath ( src ) ) ;
this . script. add ( absolutePath ( src ) ) ;
this . evaluationScores . match ( Element . scriptpath , src ) ;
} else {
this . evaluationScores . match ( Element . scriptcode , text ) ;
@ -432,7 +432,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// fire event
fireScrapeTag1 ( tagname , tagopts , text ) ;
}
public void scrapeComment ( final char [ ] comment ) {
this . evaluationScores . match ( Element . comment , comment ) ;
@ -440,28 +440,30 @@ public class ContentScraper extends AbstractScraper implements Scraper {
/**
 * Parses an inline HTML fragment with a nested scraper, merges the anchors and
 * images found there into this scraper and returns the fragment's cleaned text.
 * Fragments shorter than 14 characters are only stripped of markup and cleaned.
 *
 * @param inlineHtml the raw characters found inside a tag
 * @return the cleaned text of the fragment
 */
private String recursiveParse(final char[] inlineHtml) {
    if (inlineHtml.length < 14) return cleanLine(super.stripAll(inlineHtml));

    // start a new scraper to parse links inside this text
    // parsing the content
    final ContentScraper scraper = new ContentScraper(this.root);
    final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
    try {
        FileUtils.copy(new CharArrayReader(inlineHtml), writer);
        writer.close();
    } catch (final IOException e) {
        // fall back to plain tag stripping if the nested parse fails
        Log.logException(e);
        return cleanLine(super.stripAll(inlineHtml));
    }
    this.anchors.putAll(scraper.getAnchors());
    this.images.putAll(scraper.images);

    return cleanLine(super.stripAll(scraper.content.getChars()));
}
private final static String cleanLine ( final String s ) {
final StringBuilder sb = new StringBuilder ( s . length ( ) ) ;
char l = ' ' ;
for ( char c : s . toCharArray ( ) ) {
char c ;
for ( int i = 0 ; i < s . length ( ) ; i + + ) {
c = s . charAt ( i ) ;
if ( c < ' ' ) c = ' ' ;
if ( c = = ' ' ) {
if ( l ! = ' ' ) sb . append ( c ) ;
@ -470,91 +472,91 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
l = c ;
}
// return result
return sb . toString ( ) . trim ( ) ;
}
public String getTitle ( ) {
// construct a title string, even if the document has no title
// some documents have a title tag as meta tag
String s = metas. get ( "title" ) ;
String s = this . metas. get ( "title" ) ;
// try to construct the title with the content of the title tag
if ( title. length ( ) > 0 ) {
if ( this . title. length ( ) > 0 ) {
if ( s = = null ) {
return title;
return this . title;
}
if ( ( title. compareToIgnoreCase ( s ) = = 0 ) | | ( title. indexOf ( s ) > = 0 ) ) return s ;
return title + ": " + s ;
if ( ( this . title. compareToIgnoreCase ( s ) = = 0 ) | | ( this . title. indexOf ( s ) > = 0 ) ) return s ;
return this . title + ": " + s ;
}
if ( s ! = null ) {
return s ;
}
// otherwise take any headline
for ( int i = 0 ; i < this . headlines . length ; i + + ) {
if ( ! this . headlines [ i ] . isEmpty ( ) ) return this . headlines [ i ] . get ( 0 ) ;
}
// take description tag
s = getDescription ( ) ;
if ( s . length ( ) > 0 ) return s ;
// extract headline from file name
return MultiProtocolURI . unescape ( root. getFileName ( ) ) ;
return MultiProtocolURI . unescape ( this . root. getFileName ( ) ) ;
}
public String [ ] getHeadlines ( final int i ) {
assert ( ( i > = 1 ) & & ( i < = this . headlines . length ) ) ;
return this . headlines [ i - 1 ] . toArray ( new String [ this . headlines [ i - 1 ] . size ( ) ] ) ;
}
/**
 * @return the distinct texts collected from b and strong tags, in the order
 *         delivered by {@code keys(false)} of the score map
 */
public String[] getBold() {
    final List<String> a = new ArrayList<String>();
    final Iterator<String> i = this.bold.keys(false);
    while (i.hasNext()) a.add(i.next());
    return a.toArray(new String[a.size()]);
}
public String [ ] getBoldCount ( String [ ] a ) {
String [ ] counter = new String [ a . length ] ;
public String [ ] getBoldCount ( final String [ ] a ) {
final String [ ] counter = new String [ a . length ] ;
for ( int i = 0 ; i < a . length ; i + + ) counter [ i ] = Integer . toString ( this . bold . get ( a [ i ] ) ) ;
return counter ;
}
/**
 * @return the distinct texts collected from i tags, in the order delivered by
 *         {@code keys(false)} of the score map
 */
public String[] getItalic() {
    final List<String> a = new ArrayList<String>();
    final Iterator<String> i = this.italic.keys(false);
    while (i.hasNext()) a.add(i.next());
    return a.toArray(new String[a.size()]);
}
public String [ ] getItalicCount ( String [ ] a ) {
String [ ] counter = new String [ a . length ] ;
public String [ ] getItalicCount ( final String [ ] a ) {
final String [ ] counter = new String [ a . length ] ;
for ( int i = 0 ; i < a . length ; i + + ) counter [ i ] = Integer . toString ( this . italic . get ( a [ i ] ) ) ;
return counter ;
}
/**
 * @return a new array holding the texts collected from li tags, in the order
 *         they were added
 */
public String[] getLi() {
    final String[] items = new String[this.li.size()];
    return this.li.toArray(items);
}
public boolean containsFlash ( ) {
this . anchors = new HashMap < MultiProtocolURI , Properties > ( ) ;
String ext ;
for ( MultiProtocolURI url : this . anchors . keySet ( ) ) {
for ( final MultiProtocolURI url : this . anchors . keySet ( ) ) {
ext = url . getFileExtension ( ) ;
if ( ext = = null ) continue ;
if ( ext . equals ( "swf" ) ) return true ;
}
return false ;
}
public byte [ ] getText ( ) {
try {
return content. getBytes ( ) ;
return this . content. getBytes ( ) ;
} catch ( final OutOfMemoryError e ) {
Log . logException ( e ) ;
return new byte [ 0 ] ;
@ -563,31 +565,31 @@ public class ContentScraper extends AbstractScraper implements Scraper {
/**
 * @return the internal (live) map from link URL to the properties stored for
 *         that link; null after {@code close()}
 */
public Map<MultiProtocolURI, Properties> getAnchors() {
    // returns a url / properties relation
    return this.anchors;
}
/**
 * @return the internal (live) map from RSS feed URL to the title given in the
 *         link tag that announced the feed
 */
public Map<MultiProtocolURI, String> getRSS() {
    // returns a url / title relation
    return this.rss;
}
/**
 * @return the internal (live) map from stylesheet URL to the rel attribute of
 *         the link tag that referenced it
 */
public Map<MultiProtocolURI, String> getCSS() {
    // returns a url / rel relation
    return this.css;
}
/**
 * @return the internal (live) set of URLs taken from the src attribute of frame tags
 */
public Set<MultiProtocolURI> getFrames() {
    // returns a set of urls, not a url/name relation
    return this.frames;
}
/**
 * @return the internal (live) set of URLs taken from the src attribute of iframe tags
 */
public Set<MultiProtocolURI> getIFrames() {
    // returns a set of urls, not a url/name relation
    return this.iframes;
}
/**
 * @return the internal (live) set of URLs taken from the src attribute of script tags
 */
public Set<MultiProtocolURI> getScript() {
    return this.script;
}
/ * *
@ -596,16 +598,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
* /
public Map<MultiProtocolURI, ImageEntry> getImages() {
    // this returns the internal (live) absolute-url / ImageEntry relation; null after close()
    return this.images;
}
/**
 * @return the internal (live) map of meta entries; keys are the lower-cased
 *         name (or http-equiv) attributes, values the decoded content attributes
 */
public Map<String, String> getMetas() {
    return this.metas;
}
/ * *
* @return the { @link MultiProtocolURI } to the favicon that belongs to the document
* /
* /
public MultiProtocolURI getFavicon ( ) {
return this . favicon ;
}
@ -618,52 +620,52 @@ public class ContentScraper extends AbstractScraper implements Scraper {
< meta name = "DC.format" scheme = "DCTERMS.IMT" content = "text/html" / >
< meta name = "DC.type" scheme = "DCTERMS.DCMIType" content = "Text" / >
* /
public boolean indexingDenied ( ) {
String s = metas. get ( "robots" ) ;
final String s = this . metas. get ( "robots" ) ;
if ( s = = null ) return false ;
if ( s . indexOf ( "noindex" ) > = 0 ) return true ;
return false ;
}
public String getDescription ( ) {
String s = metas. get ( "description" ) ;
if ( s = = null ) s = metas. get ( "dc.description" ) ;
String s = this . metas. get ( "description" ) ;
if ( s = = null ) s = this . metas. get ( "dc.description" ) ;
if ( s = = null ) return "" ;
return s ;
}
public String getContentType ( ) {
final String s = metas. get ( "content-type" ) ;
final String s = this . metas. get ( "content-type" ) ;
if ( s = = null ) return "" ;
return s ;
}
public String getAuthor ( ) {
String s = metas. get ( "author" ) ;
if ( s = = null ) s = metas. get ( "dc.creator" ) ;
String s = this . metas. get ( "author" ) ;
if ( s = = null ) s = this . metas. get ( "dc.creator" ) ;
if ( s = = null ) return "" ;
return s ;
}
public String getPublisher ( ) {
String s = metas. get ( "copyright" ) ;
if ( s = = null ) s = metas. get ( "dc.publisher" ) ;
String s = this . metas. get ( "copyright" ) ;
if ( s = = null ) s = this . metas. get ( "dc.publisher" ) ;
if ( s = = null ) return "" ;
return s ;
}
// splits at every single space or comma; used to split the content-language meta value
private final static Pattern commaSepPattern = Pattern . compile ( " |," ) ;
// splits at every single space or semicolon; used for keyword lists that contain ';'
private final static Pattern semicSepPattern = Pattern . compile ( " |;" ) ;
public Set < String > getContentLanguages ( ) {
// i.e. <meta name="DC.language" content="en" scheme="DCTERMS.RFC3066">
// or <meta http-equiv="content-language" content="en">
String s = metas. get ( "content-language" ) ;
if ( s = = null ) s = metas. get ( "dc.language" ) ;
String s = this . metas. get ( "content-language" ) ;
if ( s = = null ) s = this . metas. get ( "dc.language" ) ;
if ( s = = null ) return null ;
Set < String > hs = new HashSet < String > ( ) ;
String [ ] cl = commaSepPattern . split ( s ) ;
final Set < String > hs = new HashSet < String > ( ) ;
final String [ ] cl = commaSepPattern . split ( s ) ;
int p ;
for ( int i = 0 ; i < cl . length ; i + + ) {
cl [ i ] = cl [ i ] . toLowerCase ( ) ;
@ -674,10 +676,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if ( hs . isEmpty ( ) ) return null ;
return hs ;
}
public String [ ] getKeywords ( ) {
String s = metas. get ( "keywords" ) ;
if ( s = = null ) s = metas. get ( "dc.description" ) ;
String s = this . metas. get ( "keywords" ) ;
if ( s = = null ) s = this . metas. get ( "dc.description" ) ;
if ( s = = null ) s = "" ;
if ( s . length ( ) = = 0 ) {
return MultiProtocolURI . splitpattern . split ( getTitle ( ) . toLowerCase ( ) ) ;
@ -686,9 +688,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if ( s . contains ( ";" ) ) return semicSepPattern . split ( s ) ;
return s . split ( "\\s" ) ;
}
public int getRefreshSeconds ( ) {
final String s = metas. get ( "refresh" ) ;
final String s = this . metas. get ( "refresh" ) ;
if ( s = = null ) return 9999 ;
try {
final int pos = s . indexOf ( ';' ) ;
@ -701,9 +703,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public String getRefreshPath ( ) {
String s = metas. get ( "refresh" ) ;
String s = this . metas. get ( "refresh" ) ;
if ( s = = null ) return "" ;
final int pos = s . indexOf ( ';' ) ;
if ( pos < 0 ) return "" ;
s = s . substring ( pos + 1 ) ;
@ -714,10 +716,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// parse location
// <meta NAME="ICBM" CONTENT="38.90551492, 1.454004505" />
// <meta NAME="geo.position" CONTENT="38.90551492;1.454004505" />
public float getLon ( ) {
if ( this . lon ! = 0.0f ) return this . lon ;
String s = metas. get ( "ICBM" ) ; // InterContinental Ballistic Missile (abbrev. supposed to be a joke: http://www.jargon.net/jargonfile/i/ICBMaddress.html), see http://geourl.org/add.html#icbm
String s = this . metas. get ( "ICBM" ) ; // InterContinental Ballistic Missile (abbrev. supposed to be a joke: http://www.jargon.net/jargonfile/i/ICBMaddress.html), see http://geourl.org/add.html#icbm
if ( s ! = null ) {
int p = s . indexOf ( ';' ) ;
if ( p < 0 ) p = s . indexOf ( ',' ) ;
@ -728,7 +730,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
if ( this . lon ! = 0.0f ) return this . lon ;
s = metas. get ( "geo.position" ) ; // http://geotags.com/geobot/add-tags.html
s = this . metas. get ( "geo.position" ) ; // http://geotags.com/geobot/add-tags.html
if ( s ! = null ) {
int p = s . indexOf ( ';' ) ;
if ( p < 0 ) p = s . indexOf ( ',' ) ;
@ -740,13 +742,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
return this . lon ;
}
public float getLat ( ) {
if ( this . lat ! = 0.0f ) return this . lat ;
getLon ( ) ; // parse with getLon() method which creates also the lat value
return this . lat ;
}
/ * *
* produce all model names
* @return a set of model names
@ -754,26 +756,26 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public Set<String> getEvaluationModelNames() {
    // delegate to the evaluation score collector
    final Set<String> modelNames = this.evaluationScores.getModelNames();
    return modelNames;
}
public String [ ] getEvaluationModelScoreNames ( String modelName ) {
List < String > a = new ArrayList < String > ( ) ;
ClusteredScoreMap < String > scores = this . evaluationScores . getScores ( modelName ) ;
public String [ ] getEvaluationModelScoreNames ( final String modelName ) {
final List < String > a = new ArrayList < String > ( ) ;
final ClusteredScoreMap < String > scores = this . evaluationScores . getScores ( modelName ) ;
if ( scores ! = null ) {
Iterator < String > i = scores . keys ( false ) ;
final Iterator < String > i = scores . keys ( false ) ;
while ( i . hasNext ( ) ) a . add ( i . next ( ) ) ;
}
return a . toArray ( new String [ a . size ( ) ] ) ;
}
public String [ ] getEvaluationModelScoreCounts ( String modelName , String [ ] a ) {
ClusteredScoreMap < String > scores = this . evaluationScores . getScores ( modelName ) ;
String [ ] counter = new String [ a . length ] ;
public String [ ] getEvaluationModelScoreCounts ( final String modelName , final String [ ] a ) {
final ClusteredScoreMap < String > scores = this . evaluationScores . getScores ( modelName ) ;
final String [ ] counter = new String [ a . length ] ;
if ( scores ! = null ) {
for ( int i = 0 ; i < a . length ; i + + ) counter [ i ] = Integer . toString ( scores . get ( a [ i ] ) ) ;
}
return counter ;
}
/ *
* ( non - Javadoc )
* @see de . anomic . htmlFilter . htmlFilterScraper # close ( )
@ -782,37 +784,37 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void close() {
    // free resources: close the superclass first, then drop the references once each
    super.close();
    this.anchors = null;
    this.images = null;
    this.title = null;
    this.headlines = null;
    this.content = null;
    this.root = null;
}
public void print ( ) {
System . out . println ( "TITLE :" + title) ;
System . out . println ( "TITLE :" + this . title) ;
for ( int i = 0 ; i < 4 ; i + + ) {
System . out . println ( "HEADLINE" + i + ":" + headlines[ i ] . toString ( ) ) ;
System . out . println ( "HEADLINE" + i + ":" + this . headlines[ i ] . toString ( ) ) ;
}
System . out . println ( "ANCHORS :" + anchors. toString ( ) ) ;
System . out . println ( "IMAGES :" + images. toString ( ) ) ;
System . out . println ( "METAS :" + metas. toString ( ) ) ;
System . out . println ( "TEXT :" + content. toString ( ) ) ;
System . out . println ( "ANCHORS :" + this . anchors. toString ( ) ) ;
System . out . println ( "IMAGES :" + this . images. toString ( ) ) ;
System . out . println ( "METAS :" + this . metas. toString ( ) ) ;
System . out . println ( "TEXT :" + this . content. toString ( ) ) ;
}
public void registerHtmlFilterEventListener ( final ScraperListener listener ) {
if ( listener ! = null ) {
this . htmlFilterEventListeners . add ( ScraperListener . class , listener ) ;
}
}
}
public void deregisterHtmlFilterEventListener ( final ScraperListener listener ) {
if ( listener ! = null ) {
this . htmlFilterEventListeners . remove ( ScraperListener . class , listener ) ;
}
}
}
private void fireScrapeTag0 ( final String tagname , final Properties tagopts ) {
final Object [ ] listeners = this . htmlFilterEventListeners . getListenerList ( ) ;
for ( int i = 0 ; i < listeners . length ; i + = 2 ) {
@ -820,8 +822,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
( ( ScraperListener ) listeners [ i + 1 ] ) . scrapeTag0 ( tagname , tagopts ) ;
}
}
}
}
private void fireScrapeTag1 ( final String tagname , final Properties tagopts , final char [ ] text ) {
final Object [ ] listeners = this . htmlFilterEventListeners . getListenerList ( ) ;
for ( int i = 0 ; i < listeners . length ; i + = 2 ) {
@ -830,26 +832,26 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
}
public static ContentScraper parseResource ( final File file ) throws IOException {
// load page
final byte [ ] page = FileUtils . read ( file ) ;
if ( page = = null ) throw new IOException ( "no content in file " + file . toString ( ) ) ;
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream ( new ByteArrayInputStream ( page ) , "UTF-8" , new MultiProtocolURI ( "http://localhost" ) , null , false ) ;
String charset = htmlParser . patchCharsetEncoding ( htmlFilter . detectCharset ( ) ) ;
if ( charset = = null )
charset = Charset . defaultCharset ( ) . toString ( ) ;
// scrape content
final ContentScraper scraper = new ContentScraper ( new MultiProtocolURI ( "http://localhost" ) ) ;
final Writer writer = new TransformerWriter ( null , null , scraper , null , false ) ;
FileUtils . copy ( new ByteArrayInputStream ( page ) , writer , Charset . forName ( charset ) ) ;
return scraper ;
}
public static void addAllImages ( final Map < MultiProtocolURI , ImageEntry > a , final Map < MultiProtocolURI , ImageEntry > b ) {
final Iterator < Map . Entry < MultiProtocolURI , ImageEntry > > i = b . entrySet ( ) . iterator ( ) ;
Map . Entry < MultiProtocolURI , ImageEntry > ie ;
@ -858,7 +860,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
addImage ( a , ie . getValue ( ) ) ;
}
}
public static void addImage ( final Map < MultiProtocolURI , ImageEntry > a , final ImageEntry ie ) {
if ( a . containsKey ( ie . url ( ) ) ) {
// in case of a collision, take that image that has the better image size tags
@ -867,6 +869,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
a . put ( ie . url ( ) , ie ) ;
}
}
}