@ -125,7 +125,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
String b = cleanLine ( super . stripAll ( newtext ) ) ;
String b = cleanLine ( super . stripAll ( newtext ) ) ;
if ( ( insideTag ! = null ) & & ( ! ( insideTag . equals ( "a" ) ) ) ) {
if ( ( insideTag ! = null ) & & ( ! ( insideTag . equals ( "a" ) ) ) ) {
// texts inside tags sometimes have no punctuation at the line end
// texts inside tags sometimes have no punctuation at the line end
// this is bad for the text semantics, because it is not possible for the
// this is bad for the text semantics, because it is not possible for the
// condenser to distinguish headlines from text beginnings.
// condenser to distinguish headlines from text beginnings.
// to make it easier for the condenser, a dot ('.') is appended in case that
// to make it easier for the condenser, a dot ('.') is appended in case that
// no punctuation is part of the newtext line
// no punctuation is part of the newtext line
@ -141,6 +141,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if ( p = = Integer . MAX_VALUE ) break ;
if ( p = = Integer . MAX_VALUE ) break ;
q = b . indexOf ( " " , p + 1 ) ;
q = b . indexOf ( " " , p + 1 ) ;
u = b . substring ( p , q < 0 ? b . length ( ) : q ) ;
u = b . substring ( p , q < 0 ? b . length ( ) : q ) ;
if ( u . endsWith ( "." ) ) u = u . substring ( 0 , u . length ( ) - 1 ) ; // remove the '.' that was appended above
s = p + 1 ;
s = p + 1 ;
try {
try {
url = new MultiProtocolURI ( u ) ;
url = new MultiProtocolURI ( u ) ;
@ -351,11 +352,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
s = getDescription ( ) ;
s = getDescription ( ) ;
if ( s . length ( ) > 0 ) return s ;
if ( s . length ( ) > 0 ) return s ;
// extract headline from content
// extract headline from file name
if ( content . length ( ) > 80 ) {
return MultiProtocolURI . unescape ( root . getFileName ( ) ) ;
return cleanLine ( new String ( content . getChars ( ) , 0 , 80 ) ) ;
}
return cleanLine ( content . trim ( ) . toString ( ) ) ;
}
}
public String [ ] getHeadlines ( final int i ) {
public String [ ] getHeadlines ( final int i ) {