@ -31,12 +31,11 @@ import java.util.HashMap;
import java.util.HashSet ;
import java.util.List ;
import java.util.Map ;
import java.util.Map.Entry ;
import java.util.Set ;
import java.util.concurrent.ConcurrentHashMap ;
import net.yacy.cora.document.Classification ;
import net.yacy.cora.document.MultiProtocolURI ;
import net.yacy.cora.document.UTF8 ;
import net.yacy.document.parser.bzipParser ;
import net.yacy.document.parser.csvParser ;
import net.yacy.document.parser.docParser ;
@ -60,7 +59,6 @@ import net.yacy.document.parser.vcfParser;
import net.yacy.document.parser.vsdParser ;
import net.yacy.document.parser.xlsParser ;
import net.yacy.document.parser.zipParser ;
import net.yacy.document.parser.html.ImageEntry ;
import net.yacy.document.parser.images.genericImageParser ;
import net.yacy.kelondro.logging.Log ;
import net.yacy.kelondro.util.FileUtils ;
@ -144,8 +142,7 @@ public final class TextParser {
final MultiProtocolURI location ,
final String mimeType ,
final String charset ,
final File sourceFile ,
final boolean multipleVirtualDocs
final File sourceFile
) throws InterruptedException , Parser . Failure {
BufferedInputStream sourceStream = null ;
@ -158,7 +155,7 @@ public final class TextParser {
throw new Parser . Failure ( errorMsg , location ) ;
}
sourceStream = new BufferedInputStream ( new FileInputStream ( sourceFile ) ) ;
docs = parseSource ( location , mimeType , charset , sourceFile . length ( ) , sourceStream , multipleVirtualDocs );
docs = parseSource ( location , mimeType , charset , sourceFile . length ( ) , sourceStream );
} catch ( final Exception e ) {
if ( e instanceof InterruptedException ) throw ( InterruptedException ) e ;
if ( e instanceof Parser . Failure ) throw ( Parser . Failure ) e ;
@ -176,8 +173,7 @@ public final class TextParser {
final MultiProtocolURI location ,
String mimeType ,
final String charset ,
final byte [ ] content ,
final boolean multipleVirtualDocs
final byte [ ] content
) throws Parser . Failure {
if ( log . isFine ( ) ) log . logFine ( "Parsing '" + location + "' from byte-array" ) ;
mimeType = normalizeMimeType ( mimeType ) ;
@ -193,9 +189,6 @@ public final class TextParser {
Document [ ] docs = parseSource ( location , mimeType , idioms , charset , content ) ;
// finally enrich the docs set with virtual docs from the enclosed documents
if ( multipleVirtualDocs & & docs . length = = 1 ) docs = virtualDocs ( docs [ 0 ] ) ;
return docs ;
}
@ -204,8 +197,7 @@ public final class TextParser {
String mimeType ,
final String charset ,
final long contentLength ,
final InputStream sourceStream ,
final boolean multipleVirtualDocs
final InputStream sourceStream
) throws Parser . Failure {
if ( log . isFine ( ) ) log . logFine ( "Parsing '" + location + "' from stream" ) ;
mimeType = normalizeMimeType ( mimeType ) ;
@ -236,9 +228,6 @@ public final class TextParser {
}
Document [ ] docs = parseSource ( location , mimeType , idioms , charset , b ) ;
// finally enrich the docs set with virtual docs from the enclosed documents
if ( multipleVirtualDocs & & docs . length = = 1 ) docs = virtualDocs ( docs [ 0 ] ) ;
return docs ;
}
@ -281,7 +270,13 @@ public final class TextParser {
final HashMap < Parser , Parser . Failure > failedParser = new HashMap < Parser , Parser . Failure > ( ) ;
if ( MemoryControl . request ( sourceArray . length * 6 , false ) ) {
for ( final Parser parser : parsers ) {
ByteArrayInputStream bis = new ByteArrayInputStream ( sourceArray ) ;
ByteArrayInputStream bis ;
if ( mimeType . equals ( "text/plain" ) & & parser . getName ( ) . equals ( "HTML Parser" ) ) {
// a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.
bis = new ByteArrayInputStream ( UTF8 . getBytes ( "<html><head></head><body><h1>" + UTF8 . String ( sourceArray ) + "</h1></body><html>" ) ) ;
} else {
bis = new ByteArrayInputStream ( sourceArray ) ;
}
try {
docs = parser . parse ( location , mimeType , documentCharset , bis ) ;
} catch ( final Parser . Failure e ) {
@ -477,73 +472,4 @@ public final class TextParser {
if ( grant ) denyExtensionx . remove ( ext ) ; else denyExtensionx . put ( ext , v ) ;
}
/ * *
* produce virtual documents for each of the link that is contained in the document
* @param document
* @return
* /
public static Document [ ] virtualDocs ( final Document document ) {
final ArrayList < Document > docs = new ArrayList < Document > ( ) ;
docs . add ( document ) ;
for ( final Map . Entry < MultiProtocolURI , String > link : document . getApplinks ( ) . entrySet ( ) ) {
docs . add ( genLinkDocs ( "application" , link . getKey ( ) , link . getValue ( ) , document . getContentLanguages ( ) ) ) ;
}
for ( final Map . Entry < MultiProtocolURI , String > link : document . getAudiolinks ( ) . entrySet ( ) ) {
docs . add ( genLinkDocs ( "audio" , link . getKey ( ) , link . getValue ( ) , document . getContentLanguages ( ) ) ) ;
}
for ( final Map . Entry < MultiProtocolURI , String > link : document . getVideolinks ( ) . entrySet ( ) ) {
docs . add ( genLinkDocs ( "video" , link . getKey ( ) , link . getValue ( ) , document . getContentLanguages ( ) ) ) ;
}
for ( final Entry < MultiProtocolURI , ImageEntry > link : document . getImages ( ) . entrySet ( ) ) {
docs . add ( genImageDocs ( link . getValue ( ) ) ) ;
}
// finally return the list of documents
return docs . toArray ( new Document [ docs . size ( ) ] ) ;
}
/**
 * Creates a stand-alone document for a single link found in another document.
 * The MIME type is derived from the file extension of the link target, and the
 * charset argument is the fixed string "UTF-8".
 * NOTE(review): the Document constructor is positional; the meaning of each
 * slot is defined in Document and is not visible here - confirm before reordering.
 *
 * @param type             link type string supplied by the caller
 * @param uri              target of the link; also used as the new document's location
 * @param descr            description text of the link
 * @param contentLanguages languages handed over from the containing document
 * @return the newly constructed document
 */
private final static Document genLinkDocs(final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
    final String mimeFromExtension = Classification.ext2mime(uri.getFileExtension());
    // the description is passed twice: once as a plain string, once wrapped in an array
    final String[] descrAsArray = new String[] { descr };
    final String normalForm = uri.toNormalform(false, false);

    return new Document(
            uri, mimeFromExtension, "UTF-8",
            null, contentLanguages, null,
            descr, "", "",
            descrAsArray, type,
            0.0f, 0.0f,
            normalForm,
            null, null, null,
            false);
}
/**
 * Creates a stand-alone document for a single image entry.
 * The MIME type is derived from the file extension of the image url, the
 * charset argument is the fixed string "UTF-8", and no content languages are passed.
 * NOTE(review): the Document constructor is positional; the meaning of each
 * slot is defined in Document and is not visible here - confirm before reordering.
 *
 * @param img image entry that supplies the url and the alt text
 * @return the newly constructed document
 */
private final static Document genImageDocs(final ImageEntry img) {
    final String mimeFromExtension = Classification.ext2mime(img.url().getFileExtension());
    final String altText = img.alt();
    // the alt text is passed twice: once as a plain string, once wrapped in an array
    final String[] altAsArray = new String[] { altText };
    final String normalForm = img.url().toNormalform(false, false);

    return new Document(
            img.url(), mimeFromExtension, "UTF-8",
            null, null, null,
            altText, "", "",
            altAsArray, "image",
            0.0f, 0.0f,
            normalForm,
            null, null, null,
            false);
}
}