- Redesigned crawler and parser to accept embedded links from the NOLOAD
queue and not from virtual documents generated by the parser.
- The parser now generates descriptive texts for NOLOAD entries,
which should make it possible to find media content using the search
index instead of the media prefetch algorithm during search (which
was costly).
- Removed the media-search prefetch process from image search
pull/1/head
Michael Peter Christen 13 years ago
parent 3bea25c513
commit 659178942f

@ -56,7 +56,7 @@ import de.anomic.crawler.retrieval.Request;
public class Balancer { public class Balancer {
private static final String indexSuffix = "9.db"; private static final String indexSuffix = "A.db";
private static final int EcoFSBufferSize = 1000; private static final int EcoFSBufferSize = 1000;
private static final int objectIndexBufferSize = 1000; private static final int objectIndexBufferSize = 1000;
private static final String localhost = "localhost"; private static final String localhost = "localhost";

@ -60,8 +60,8 @@ import de.anomic.crawler.retrieval.Response;
public class CrawlQueues { public class CrawlQueues {
private static final String ERROR_DB_FILENAME = "urlError3.db"; private static final String ERROR_DB_FILENAME = "urlError4.db";
private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db"; private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";
private static final Segments.Process PROCESS = Segments.Process.LOCALCRAWLING; private static final Segments.Process PROCESS = Segments.Process.LOCALCRAWLING;
protected Switchboard sb; protected Switchboard sb;

@ -159,7 +159,7 @@ public final class HTTPLoader {
// check if the url was already indexed // check if the url was already indexed
final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash()); final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
if (dbname != null) { if (dbname != null) { //OTTO
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code); this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname); throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
} }

@ -53,8 +53,8 @@ public class Request extends WorkflowJob
+ Word.commonHashLength + Word.commonHashLength
+ ", " + ", "
+ // the url's referrer hash + // the url's referrer hash
"String urlname-80, " "String urlname-256, "
+ // the name of the url, from anchor tag <a>name</a> + // the name of the url, from anchor tag <a>name</a> (must be big to transport NOLOAD entries)
"Cardinal appdate-8 {b256}, " "Cardinal appdate-8 {b256}, "
+ // the date of the resource; either file date or first appearance + // the date of the resource; either file date or first appearance
"String profile-" "String profile-"
@ -78,6 +78,8 @@ public class Request extends WorkflowJob
"Cardinal size-8 {b256}", // size of resource in bytes (if known) or 0 if not known "Cardinal size-8 {b256}", // size of resource in bytes (if known) or 0 if not known
Base64Order.enhancedCoder); Base64Order.enhancedCoder);
public final static int descrLength = rowdef.column(4).cellwidth;
private byte[] initiator; // the initiator hash, is NULL or "" if it is the own proxy; private byte[] initiator; // the initiator hash, is NULL or "" if it is the own proxy;
// if this is generated by a crawl, the own peer hash in entered // if this is generated by a crawl, the own peer hash in entered
private byte[] refhash; // the url's referrer hash private byte[] refhash; // the url's referrer hash

@ -162,16 +162,23 @@ public class Response {
this.content = content; this.content = content;
} }
/**
* create a 'virtual' response that is composed using crawl details from the request object
* this is used when the NOLOAD queue is processed
* @param request
* @param profile
*/
public Response(final Request request, final CrawlProfile profile) { public Response(final Request request, final CrawlProfile profile) {
this.request = request; this.request = request;
// request and response headers may be zero in case that we process surrogates // request and response headers may be zero in case that we process surrogates
this.requestHeader = new RequestHeader(); this.requestHeader = new RequestHeader();
this.responseHeader = new ResponseHeader(); this.responseHeader = new ResponseHeader();
this.responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); // tell parser how to handle the content
if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size())); if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
this.responseStatus = "200"; this.responseStatus = "200";
this.profile = profile; this.profile = profile;
this.status = QUEUE_STATE_FRESH; this.status = QUEUE_STATE_FRESH;
this.content = request.url().toTokens().getBytes(); this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes();
} }
public Response( public Response(
@ -824,7 +831,7 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime()); final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url()); if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try { try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content, false); return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content);
} catch (final Exception e) { } catch (final Exception e) {
return null; return null;
} }

@ -60,6 +60,7 @@ import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer; import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.retrieval.Request;
public class Document { public class Document {
@ -827,7 +828,8 @@ dc_rights
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>(); final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (final Document d: documents) { for (final Document d: documents) {
for (final ImageEntry imageReference : d.getImages().values()) { for (final ImageEntry imageReference : d.getImages().values()) {
result.put(imageReference.url(), imageReference.alt()); // construct a image name which contains the document title to enhance the search process for images
result.put(imageReference.url(), description(d, imageReference.alt()));
} }
} }
return result; return result;
@ -835,20 +837,57 @@ dc_rights
public static Map<MultiProtocolURI, String> getAudiolinks(final Document[] documents) { public static Map<MultiProtocolURI, String> getAudiolinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>(); final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (final Document d: documents) result.putAll(d.audiolinks); for (final Document d: documents) {
for (Map.Entry<MultiProtocolURI, String> e: d.audiolinks.entrySet()) {
result.put(e.getKey(), description(d, e.getValue()));
}
}
return result; return result;
} }
public static Map<MultiProtocolURI, String> getVideolinks(final Document[] documents) { public static Map<MultiProtocolURI, String> getVideolinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>(); final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (final Document d: documents) result.putAll(d.videolinks); for (final Document d: documents) {
for (Map.Entry<MultiProtocolURI, String> e: d.videolinks.entrySet()) {
result.put(e.getKey(), description(d, e.getValue()));
}
}
return result; return result;
} }
public static Map<MultiProtocolURI, String> getApplinks(final Document[] documents) { public static Map<MultiProtocolURI, String> getApplinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>(); final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (final Document d: documents) result.putAll(d.applinks); for (final Document d: documents) {
for (Map.Entry<MultiProtocolURI, String> e: d.applinks.entrySet()) {
result.put(e.getKey(), description(d, e.getValue()));
}
}
return result; return result;
} }
/**
 * Compose the description text that is attached to a media link found in a document.
 * The text starts with the document title, followed (space-separated) by the document
 * description if it differs from the title and by the result of dc_subject(','), as long
 * as there is still room for them. Finally " - " and the tag name are appended; the
 * leading part is cut back so that the tag name is not lost within Request.descrLength.
 *
 * @param d the document that contains the link
 * @param tagname the name attached to the link; if null or empty, d.source.toTokens() is used instead
 * @return the composed description with leading and trailing whitespace removed
 */
private static final String description(Document d, String tagname) {
    if (tagname == null || tagname.length() == 0) {
        tagname = d.source.toTokens();
    }
    // characters that remain for title/description/subject once the tag name is reserved;
    // this can be negative if the tag name alone is longer than the description field
    final int room = Request.descrLength - tagname.length();
    StringBuilder sb = new StringBuilder(60);
    sb.append(d.dc_title());
    if (!d.dc_description().equals(d.dc_title()) && sb.length() < room) {
        sb.append(' ');
        sb.append(d.dc_description());
    }
    if (sb.length() < room) {
        sb.append(' ');
        sb.append(d.dc_subject(','));
    }
    if (tagname.length() > 0) {
        // reserve three more characters for the " - " separator; clamp to zero because
        // StringBuilder.setLength rejects negative lengths (very long tag names)
        final int limit = Math.max(0, room - 3);
        if (sb.length() > limit) {
            // cut this off because otherwise the tagname is lost.
            sb.setLength(limit);
        }
        sb.append(" - ");
        sb.append(tagname);
    }
    return sb.toString().trim();
}
} }

@ -31,12 +31,11 @@ import java.util.HashMap;
import java.util.HashSet; import java.util.HashSet;
import java.util.List; import java.util.List;
import java.util.Map; import java.util.Map;
import java.util.Map.Entry;
import java.util.Set; import java.util.Set;
import java.util.concurrent.ConcurrentHashMap; import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.Classification;
import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.parser.bzipParser; import net.yacy.document.parser.bzipParser;
import net.yacy.document.parser.csvParser; import net.yacy.document.parser.csvParser;
import net.yacy.document.parser.docParser; import net.yacy.document.parser.docParser;
@ -60,7 +59,6 @@ import net.yacy.document.parser.vcfParser;
import net.yacy.document.parser.vsdParser; import net.yacy.document.parser.vsdParser;
import net.yacy.document.parser.xlsParser; import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser; import net.yacy.document.parser.zipParser;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.images.genericImageParser; import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.logging.Log; import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils; import net.yacy.kelondro.util.FileUtils;
@ -144,8 +142,7 @@ public final class TextParser {
final MultiProtocolURI location, final MultiProtocolURI location,
final String mimeType, final String mimeType,
final String charset, final String charset,
final File sourceFile, final File sourceFile
final boolean multipleVirtualDocs
) throws InterruptedException, Parser.Failure { ) throws InterruptedException, Parser.Failure {
BufferedInputStream sourceStream = null; BufferedInputStream sourceStream = null;
@ -158,7 +155,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location); throw new Parser.Failure(errorMsg, location);
} }
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile)); sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream, multipleVirtualDocs); docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -176,8 +173,7 @@ public final class TextParser {
final MultiProtocolURI location, final MultiProtocolURI location,
String mimeType, String mimeType,
final String charset, final String charset,
final byte[] content, final byte[] content
final boolean multipleVirtualDocs
) throws Parser.Failure { ) throws Parser.Failure {
if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array"); if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
mimeType = normalizeMimeType(mimeType); mimeType = normalizeMimeType(mimeType);
@ -193,9 +189,6 @@ public final class TextParser {
Document[] docs = parseSource(location, mimeType, idioms, charset, content); Document[] docs = parseSource(location, mimeType, idioms, charset, content);
// finally enrich the docs set with virtual docs from the enclosed documents
if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
return docs; return docs;
} }
@ -204,8 +197,7 @@ public final class TextParser {
String mimeType, String mimeType,
final String charset, final String charset,
final long contentLength, final long contentLength,
final InputStream sourceStream, final InputStream sourceStream
final boolean multipleVirtualDocs
) throws Parser.Failure { ) throws Parser.Failure {
if (log.isFine()) log.logFine("Parsing '" + location + "' from stream"); if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType); mimeType = normalizeMimeType(mimeType);
@ -236,9 +228,6 @@ public final class TextParser {
} }
Document[] docs = parseSource(location, mimeType, idioms, charset, b); Document[] docs = parseSource(location, mimeType, idioms, charset, b);
// finally enrich the docs set with virtual docs from the enclosed documents
if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
return docs; return docs;
} }
@ -281,7 +270,13 @@ public final class TextParser {
final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>(); final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
if (MemoryControl.request(sourceArray.length * 6, false)) { if (MemoryControl.request(sourceArray.length * 6, false)) {
for (final Parser parser: parsers) { for (final Parser parser: parsers) {
ByteArrayInputStream bis = new ByteArrayInputStream(sourceArray); ByteArrayInputStream bis;
if (mimeType.equals("text/plain") && parser.getName().equals("HTML Parser")) {
// a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.
bis = new ByteArrayInputStream(UTF8.getBytes("<html><head></head><body><h1>" + UTF8.String(sourceArray) + "</h1></body><html>"));
} else {
bis = new ByteArrayInputStream(sourceArray);
}
try { try {
docs = parser.parse(location, mimeType, documentCharset, bis); docs = parser.parse(location, mimeType, documentCharset, bis);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {
@ -477,73 +472,4 @@ public final class TextParser {
if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v); if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v);
} }
/**
 * Expand a document into itself plus one virtual document for every link it contains.
 * The returned array starts with the given document, followed by the virtual documents
 * generated for its application, audio and video links and finally for its images.
 *
 * @param document the document whose embedded links are turned into virtual documents
 * @return the given document followed by all generated virtual documents
 */
public static Document[] virtualDocs(final Document document) {
    final ArrayList<Document> collected = new ArrayList<Document>();
    collected.add(document);

    // link maps: key is the link target, value is the text attached to the link
    for (final Map.Entry<MultiProtocolURI, String> app : document.getApplinks().entrySet()) {
        final Document virtual = genLinkDocs("application", app.getKey(), app.getValue(), document.getContentLanguages());
        collected.add(virtual);
    }
    for (final Map.Entry<MultiProtocolURI, String> audio : document.getAudiolinks().entrySet()) {
        final Document virtual = genLinkDocs("audio", audio.getKey(), audio.getValue(), document.getContentLanguages());
        collected.add(virtual);
    }
    for (final Map.Entry<MultiProtocolURI, String> video : document.getVideolinks().entrySet()) {
        final Document virtual = genLinkDocs("video", video.getKey(), video.getValue(), document.getContentLanguages());
        collected.add(virtual);
    }

    // images carry their own entry object, only the map values are needed
    for (final Map.Entry<MultiProtocolURI, ImageEntry> image : document.getImages().entrySet()) {
        final Document virtual = genImageDocs(image.getValue());
        collected.add(virtual);
    }

    final Document[] result = new Document[collected.size()];
    return collected.toArray(result);
}
/**
 * Create a virtual document for a single link.
 * The mime type is derived from the file extension of the link target; the link
 * description is handed to the Document constructor both as a plain string and as a
 * one-element array, and the normal form of the link target is passed along as well.
 * NOTE(review): the meaning of the individual positional constructor arguments is
 * defined by the Document class - confirm there before changing any of them.
 *
 * @param type the kind of link ("application", "audio" or "video" at the call sites in this class)
 * @param uri the link target
 * @param descr the text attached to the link
 * @param contentLanguages the content languages of the document that contains the link
 * @return the virtual document for the link
 */
private final static Document genLinkDocs(final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
    final String mime = Classification.ext2mime(uri.getFileExtension());
    final String[] descrArray = new String[]{descr};
    final String normalform = uri.toNormalform(false, false);
    return new Document(
            uri, mime, "UTF-8",
            null, contentLanguages, null,
            descr, "", "",
            descrArray, type,
            0.0f, 0.0f,
            normalform,
            null, null, null,
            false);
}
/**
 * Create a virtual document for a single image.
 * The mime type is derived from the file extension of the image url; the alt text of the
 * image is handed to the Document constructor both as a plain string and as a one-element
 * array, the type string is "image" and the normal form of the image url is passed along.
 * NOTE(review): the meaning of the individual positional constructor arguments is
 * defined by the Document class - confirm there before changing any of them.
 *
 * @param img the image entry to generate the virtual document for
 * @return the virtual document for the image
 */
private final static Document genImageDocs(final ImageEntry img) {
    final String mime = Classification.ext2mime(img.url().getFileExtension());
    final String[] altArray = new String[]{img.alt()};
    final String normalform = img.url().toNormalform(false, false);
    return new Document(
            img.url(), mime, "UTF-8",
            null, null, null,
            img.alt(), "", "",
            altArray, "image",
            0.0f, 0.0f,
            normalform,
            null, null, null,
            false);
}
} }

@ -101,14 +101,17 @@ public class MediawikiImporter extends Thread implements Importer {
this.urlStub = null; this.urlStub = null;
} }
@Override
public int count() { public int count() {
return this.count; return this.count;
} }
@Override
public String source() { public String source() {
return this.sourcefile.getAbsolutePath(); return this.sourcefile.getAbsolutePath();
} }
@Override
public String status() { public String status() {
return ""; return "";
} }
@ -117,6 +120,7 @@ public class MediawikiImporter extends Thread implements Importer {
* return the number of articles per second * return the number of articles per second
* @return * @return
*/ */
@Override
public int speed() { public int speed() {
if (this.count == 0) return 0; if (this.count == 0) return 0;
return (int) (this.count / Math.max(1L, runningTime() )); return (int) (this.count / Math.max(1L, runningTime() ));
@ -126,14 +130,17 @@ public class MediawikiImporter extends Thread implements Importer {
* return the remaining seconds for the completion of all records in milliseconds * return the remaining seconds for the completion of all records in milliseconds
* @return * @return
*/ */
@Override
public long remainingTime() { public long remainingTime() {
return Math.max(0, this.approxdocs - this.count) / Math.max(1, speed() ); return Math.max(0, this.approxdocs - this.count) / Math.max(1, speed() );
} }
@Override
public long runningTime() { public long runningTime() {
return (System.currentTimeMillis() - this.start) / 1000L; return (System.currentTimeMillis() - this.start) / 1000L;
} }
@Override
public void run() { public void run() {
this.start = System.currentTimeMillis(); this.start = System.currentTimeMillis();
try { try {
@ -287,6 +294,7 @@ public class MediawikiImporter extends Thread implements Importer {
this.mediawikixml = mediawikixml; this.mediawikixml = mediawikixml;
} }
@Override
public void run() { public void run() {
try { try {
createIndex(this.mediawikixml); createIndex(this.mediawikixml);
@ -365,6 +373,7 @@ public class MediawikiImporter extends Thread implements Importer {
} }
} }
@Override
public Integer call() { public Integer call() {
wikisourcerecord r; wikisourcerecord r;
try { try {
@ -412,6 +421,7 @@ public class MediawikiImporter extends Thread implements Importer {
} }
} }
@Override
public Integer call() { public Integer call() {
wikisourcerecord r; wikisourcerecord r;
wikiraw c; wikiraw c;
@ -505,7 +515,7 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure { public void genDocument() throws Parser.Failure {
try { try {
this.url = new DigestURI(this.urlStub + this.title); this.url = new DigestURI(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html), false); final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed); this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here // the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title); this.document.setTitle(this.title);
@ -626,6 +636,7 @@ public class MediawikiImporter extends Thread implements Importer {
this.out = out; this.out = out;
} }
@Override
public Integer call() { public Integer call() {
wikiparserrecord record; wikiparserrecord record;
try { try {
@ -682,6 +693,7 @@ public class MediawikiImporter extends Thread implements Importer {
this.outputfilename = null; this.outputfilename = null;
} }
@Override
public Integer call() { public Integer call() {
wikiparserrecord record; wikiparserrecord record;
try { try {

@ -55,6 +55,7 @@ public class bzipParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/x-stuffit"); this.SUPPORTED_MIME_TYPES.add("application/x-stuffit");
} }
@Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, public Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source) final String charset, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
@ -93,7 +94,7 @@ public class bzipParser extends AbstractParser implements Parser {
out.close(); out.close();
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, tempFile, false); docs = TextParser.parseSource(location, null, null, tempFile);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -54,6 +54,7 @@ public class gzipParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("gzip/document"); this.SUPPORTED_MIME_TYPES.add("gzip/document");
} }
@Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null; File tempFile = null;
@ -78,7 +79,7 @@ public class gzipParser extends AbstractParser implements Parser {
out.close(); out.close();
// creating a new parser class to parse the unzipped content // creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location,null,null,tempFile, false); docs = TextParser.parseSource(location,null,null,tempFile);
} catch (final Exception e) { } catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e; if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e; if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -99,6 +99,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
} }
} }
@Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset,
final InputStream source) throws Parser.Failure, InterruptedException { final InputStream source) throws Parser.Failure, InterruptedException {
try { try {
@ -166,7 +167,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects // below for reversion of the effects
final MultiProtocolURI url = MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath); final MultiProtocolURI url = MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1)); final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray(), false); theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs); this.doc.addSubDocuments(theDocs);
} }

@ -59,6 +59,7 @@ public class tarParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("multipart/x-tar"); this.SUPPORTED_MIME_TYPES.add("multipart/x-tar");
} }
@Override
public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException { public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
final List<Document> docacc = new ArrayList<Document>(); final List<Document> docacc = new ArrayList<Document>();
@ -88,7 +89,7 @@ public class tarParser extends AbstractParser implements Parser {
try { try {
tmp = FileUtils.createTempFile(this.getClass(), name); tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize()); FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp, false); subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
if (subDocs == null) continue; if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d); for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {

@ -59,6 +59,7 @@ public class zipParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/vnd.android.package-archive"); this.SUPPORTED_MIME_TYPES.add("application/vnd.android.package-archive");
} }
@Override
public Document[] parse(final MultiProtocolURI url, final String mimeType, public Document[] parse(final MultiProtocolURI url, final String mimeType,
final String charset, final InputStream source) final String charset, final InputStream source)
throws Parser.Failure, InterruptedException { throws Parser.Failure, InterruptedException {
@ -87,7 +88,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize()); FileUtils.copy(zis, tmp, entry.getSize());
final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name); final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false)); //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(virtualURL, mime, null, tmp, false); docs = TextParser.parseSource(virtualURL, mime, null, tmp);
if (docs == null) continue; if (docs == null) continue;
for (final Document d: docs) docacc.add(d); for (final Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) { } catch (final Parser.Failure e) {

@ -386,7 +386,7 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.mime()); final String supportError = TextParser.supports(url, responseHeader.mime());
if (supportError != null) throw new IOException("no parser support: " + supportError); if (supportError != null) throw new IOException("no parser support: " + supportError);
try { try {
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent(), false); documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent());
if (documents == null) throw new IOException("document == null"); if (documents == null) throw new IOException("document == null");
} catch (final Exception e) { } catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage()); throw new IOException("parser error: " + e.getMessage());

@ -2329,8 +2329,7 @@ public final class Switchboard extends serverSwitch
response.url(), response.url(),
response.getMimeType(), response.getMimeType(),
response.getCharacterEncoding(), response.getCharacterEncoding(),
response.getContent(), response.getContent());
response.profile().directDocByURL());
if ( documents == null ) { if ( documents == null ) {
throw new Parser.Failure("Parser returned null.", response.url()); throw new Parser.Failure("Parser returned null.", response.url());
} }

@ -150,7 +150,7 @@ public class DocumentIndex extends Segment
length = -1; length = -1;
} }
try { try {
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1), true); documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
} catch ( final Exception e ) { } catch ( final Exception e ) {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage()); throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
} }

Loading…
Cancel
Save