@ -36,8 +36,10 @@ import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException ;
import java.nio.charset.StandardCharsets ;
import java.nio.charset.UnsupportedCharsetException ;
import java.util.HashSet ;
import java.util.LinkedHashMap ;
import java.util.Locale ;
import java.util.Set ;
import org.apache.commons.io.IOUtils ;
@ -105,7 +107,20 @@ public class htmlParser extends AbstractParser implements Parser {
final int timezoneOffset ,
final InputStream sourceStream ) throws Parser . Failure , InterruptedException {
return parseWithLimits ( location , mimeType , documentCharset , vocscraper , timezoneOffset , sourceStream , Integer . MAX_VALUE , DEFAULT_MAX_LINKS , Long . MAX_VALUE ) ;
return parseWithLimits ( location , mimeType , documentCharset , new HashSet < String > ( ) , vocscraper , timezoneOffset , sourceStream , Integer . MAX_VALUE , DEFAULT_MAX_LINKS , Long . MAX_VALUE ) ;
}
@Override
public Document [ ] parse (
final DigestURL location ,
final String mimeType ,
final String documentCharset ,
final Set < String > ignore_class_name ,
final VocabularyScraper vocscraper ,
final int timezoneOffset ,
final InputStream sourceStream ) throws Parser . Failure , InterruptedException {
return parseWithLimits ( location , mimeType , documentCharset , ignore_class_name , vocscraper , timezoneOffset , sourceStream , Integer . MAX_VALUE , DEFAULT_MAX_LINKS , Long . MAX_VALUE ) ;
}
@Override
@ -114,19 +129,20 @@ public class htmlParser extends AbstractParser implements Parser {
}
@Override
public Document [ ] parseWithLimits ( final DigestURL location , final String mimeType , final String documentCharset , final VocabularyScraper vocscraper ,
public Document [ ] parseWithLimits ( final DigestURL location , final String mimeType , final String documentCharset ,
final Set < String > ignore_class_name , final VocabularyScraper vocscraper ,
final int timezoneOffset , final InputStream sourceStream , final int maxLinks , final long maxBytes )
throws Failure {
return parseWithLimits ( location , mimeType , documentCharset , vocscraper, timezoneOffset , sourceStream , maxLinks , maxLinks , maxBytes ) ;
return parseWithLimits ( location , mimeType , documentCharset , ignore_class_name, vocscraper, timezoneOffset , sourceStream , maxLinks , maxLinks , maxBytes ) ;
}
private Document [ ] parseWithLimits ( final DigestURL location , final String mimeType , final String documentCharset , final VocabularyScraper vocscraper ,
private Document [ ] parseWithLimits ( final DigestURL location , final String mimeType , final String documentCharset , final Set< String > ignore_class_name , final VocabularyScraper vocscraper ,
final int timezoneOffset , final InputStream sourceStream , final int maxAnchors , final int maxLinks , final long maxBytes )
throws Failure {
try {
// first get a document from the parsed html
Charset [ ] detectedcharsetcontainer = new Charset [ ] { null } ;
ContentScraper scraper = parseToScraper ( location , documentCharset , vocscraper, detectedcharsetcontainer , timezoneOffset , sourceStream , maxAnchors , maxLinks , maxBytes ) ;
ContentScraper scraper = parseToScraper ( location , documentCharset , ignore_class_name, vocscraper, detectedcharsetcontainer , timezoneOffset , sourceStream , maxAnchors , maxLinks , maxBytes ) ;
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper ( location , mimeType , detectedcharsetcontainer [ 0 ] . name ( ) , scraper ) ;
Document documentSnapshot = null ;
@ -135,10 +151,10 @@ public class htmlParser extends AbstractParser implements Parser {
// and create a sub-document for snapshot page (which will be merged by loader)
// TODO: as a crawl request removes anchor part from original url getRef() is never successful - considere other handling as removeRef() in crawler
if ( location . getRef ( ) ! = null & & location . getRef ( ) . startsWith ( "!" ) ) {
documentSnapshot = parseAlternativeSnapshot ( location , mimeType , documentCharset , vocscraper, timezoneOffset , maxAnchors , maxLinks , maxBytes ) ;
documentSnapshot = parseAlternativeSnapshot ( location , mimeType , documentCharset , ignore_class_name, vocscraper, timezoneOffset , maxAnchors , maxLinks , maxBytes ) ;
} else { // head tag fragment only allowed on url without anchor hashfragment, but there are discussions that existence of hashfragment anchor takes preference (means allow both)
if ( scraper . getMetas ( ) . containsKey ( "fragment" ) & & scraper . getMetas ( ) . get ( "fragment" ) . equals ( "!" ) ) {
documentSnapshot = parseAlternativeSnapshot ( location , mimeType , documentCharset , vocscraper, timezoneOffset , maxAnchors , maxLinks , maxBytes ) ;
documentSnapshot = parseAlternativeSnapshot ( location , mimeType , documentCharset , ignore_class_name, vocscraper, timezoneOffset , maxAnchors , maxLinks , maxBytes ) ;
}
}
} catch ( Exception ex1 ) { // ignore any exception for any issue with snapshot
@ -203,7 +219,7 @@ public class htmlParser extends AbstractParser implements Parser {
return ppd ;
}
public static ContentScraper parseToScraper ( final DigestURL location , final String documentCharset , final VocabularyScraper vocabularyScraper , final int timezoneOffset , final String input , final int maxAnchors , final int maxLinks ) throws IOException {
public static ContentScraper parseToScraper ( final DigestURL location , final String documentCharset , final Set< String > ignore_class_name , final VocabularyScraper vocabularyScraper , final int timezoneOffset , final String input , final int maxAnchors , final int maxLinks ) throws IOException {
Charset [ ] detectedcharsetcontainer = new Charset [ ] { null } ;
InputStream sourceStream ;
try {
@ -213,7 +229,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
ContentScraper scraper ; // for this static methode no need to init local this.scraperObject
try {
scraper = parseToScraper ( location , documentCharset , vocabularyScraper, detectedcharsetcontainer , timezoneOffset , sourceStream , maxAnchors , maxLinks , Long . MAX_VALUE ) ;
scraper = parseToScraper ( location , documentCharset , ignore_class_name, vocabularyScraper, detectedcharsetcontainer , timezoneOffset , sourceStream , maxAnchors , maxLinks , Long . MAX_VALUE ) ;
} catch ( Failure e ) {
throw new IOException ( e . getMessage ( ) ) ;
}
@ -238,6 +254,7 @@ public class htmlParser extends AbstractParser implements Parser {
public static ContentScraper parseToScraper (
final DigestURL location ,
final String documentCharset ,
final Set < String > ignore_class_name ,
final VocabularyScraper vocabularyScraper ,
final Charset [ ] detectedcharsetcontainer ,
final int timezoneOffset ,
@ -258,7 +275,7 @@ public class htmlParser extends AbstractParser implements Parser {
if ( charset = = null ) {
ScraperInputStream htmlFilter = null ;
try {
htmlFilter = new ScraperInputStream ( sourceStream , documentCharset , vocabularyScraper, location , null , false , maxLinks , timezoneOffset ) ;
htmlFilter = new ScraperInputStream ( sourceStream , documentCharset , ignore_class_name, vocabularyScraper, location , null , false , maxLinks , timezoneOffset ) ;
sourceStream = htmlFilter ;
charset = htmlFilter . detectCharset ( ) ;
} catch ( final IOException e1 ) {
@ -293,7 +310,7 @@ public class htmlParser extends AbstractParser implements Parser {
// parsing the content
// for this static method no need to init local this.scraperObject here
final ContentScraper scraper = new ContentScraper ( location , maxAnchors , maxLinks , vocabularyScraper, timezoneOffset ) ;
final ContentScraper scraper = new ContentScraper ( location , maxAnchors , maxLinks , ignore_class_name, vocabularyScraper, timezoneOffset ) ;
final TransformerWriter writer = new TransformerWriter ( null , null , scraper , null , false , Math . max ( 64 , Math . min ( 4096 , sourceStream . available ( ) ) ) ) ;
try {
final long maxChars = ( long ) ( maxBytes * detectedcharsetcontainer [ 0 ] . newDecoder ( ) . averageCharsPerByte ( ) ) ;
@ -420,8 +437,10 @@ public class htmlParser extends AbstractParser implements Parser {
* @param maxBytes the maximum number of content bytes to process
* @return document as result of parsed snapshot or null if not exist or on any other issue with snapshot
* /
private Document parseAlternativeSnapshot ( final DigestURL location , final String mimeType , final String documentCharset ,
final VocabularyScraper vocscraper , final int timezoneOffset , final int maxAnchors , final int maxLinks , final long maxBytes ) {
private Document parseAlternativeSnapshot (
final DigestURL location , final String mimeType , final String documentCharset ,
final Set < String > ignore_class_name , final VocabularyScraper vocscraper ,
final int timezoneOffset , final int maxAnchors , final int maxLinks , final long maxBytes ) {
Document documentSnapshot = null ;
try {
// construct url for case (1) with anchor
@ -440,7 +459,7 @@ public class htmlParser extends AbstractParser implements Parser {
InputStream snapshotStream = null ;
try {
snapshotStream = locationSnapshot . getInputStream ( ClientIdentification . yacyInternetCrawlerAgent ) ;
ContentScraper scraperSnapshot = parseToScraper ( location , documentCharset , vocscraper, detectedcharsetcontainer , timezoneOffset , snapshotStream , maxAnchors , maxLinks , maxBytes ) ;
ContentScraper scraperSnapshot = parseToScraper ( location , documentCharset , ignore_class_name, vocscraper, detectedcharsetcontainer , timezoneOffset , snapshotStream , maxAnchors , maxLinks , maxBytes ) ;
documentSnapshot = transformScraper ( location , mimeType , detectedcharsetcontainer [ 0 ] . name ( ) , scraperSnapshot ) ;
} finally {
if ( snapshotStream ! = null ) {