From 659178942f2918e8150e70b209364425575b5a53 Mon Sep 17 00:00:00 2001
From: Michael Peter Christen
Date: Tue, 24 Apr 2012 16:07:03 +0200
Subject: [PATCH] Redesigned crawler and parser to accept embedded links from
 the NOLOAD queue and not from virtual documents generated by the parser.

- The parser now generates nice description texts for NOLOAD entries, which
  shall make it possible to find media content using the search index instead
  of the media prefetch algorithm during search (which was costly).
- Removed the media-search prefetch process from the image search.
---
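Note (illustration only, not part of the applied diff): embedded media links
no longer become parser-generated virtual documents; they are queued as
NOLOAD entries whose description text is indexed directly. A minimal Java
sketch of the intended flow, assuming the surrounding method is the
Response.parse() whose body is updated in the hunks below:

    // A NOLOAD entry carries its searchable description in the request name.
    // The new Response(Request, CrawlProfile) constructor forces the
    // Content-Type to "text/plain" and uses that name as the response body,
    // falling back to URL tokens when no description is present.
    final Response response = new Response(request, profile);
    // parsing then runs through TextParser.parseSource(...); the description
    // ends up in the search index instead of a prefetched media document
    final Document[] docs = response.parse();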
 source/de/anomic/crawler/Balancer.java            |  2 +-
 source/de/anomic/crawler/CrawlQueues.java         |  4 +-
 .../anomic/crawler/retrieval/HTTPLoader.java      |  2 +-
 source/de/anomic/crawler/retrieval/Request.java   |  6 +-
 source/de/anomic/crawler/retrieval/Response.java  | 11 ++-
 source/net/yacy/document/Document.java            | 47 ++++++++-
 source/net/yacy/document/TextParser.java          | 98 +++---------------
 .../document/importer/MediawikiImporter.java      | 14 ++-
 source/net/yacy/document/parser/bzipParser.java   |  3 +-
 source/net/yacy/document/parser/gzipParser.java   |  3 +-
 .../yacy/document/parser/sevenzipParser.java      |  3 +-
 source/net/yacy/document/parser/tarParser.java    |  7 +-
 source/net/yacy/document/parser/zipParser.java    |  3 +-
 source/net/yacy/repository/LoaderDispatcher.java  |  2 +-
 source/net/yacy/search/Switchboard.java           |  3 +-
 source/net/yacy/search/index/DocumentIndex.java   |  2 +-
 16 files changed, 100 insertions(+), 110 deletions(-)

diff --git a/source/de/anomic/crawler/Balancer.java b/source/de/anomic/crawler/Balancer.java
index b0b2deca2..85f9d24f2 100644
--- a/source/de/anomic/crawler/Balancer.java
+++ b/source/de/anomic/crawler/Balancer.java
@@ -56,7 +56,7 @@ import de.anomic.crawler.retrieval.Request;
 
 public class Balancer {
 
-    private static final String indexSuffix = "9.db";
+    private static final String indexSuffix = "A.db";
     private static final int EcoFSBufferSize = 1000;
     private static final int objectIndexBufferSize = 1000;
     private static final String localhost = "localhost";
diff --git a/source/de/anomic/crawler/CrawlQueues.java b/source/de/anomic/crawler/CrawlQueues.java
index 6dd7d865a..2eb4231dc 100644
--- a/source/de/anomic/crawler/CrawlQueues.java
+++ b/source/de/anomic/crawler/CrawlQueues.java
@@ -60,8 +60,8 @@ import de.anomic.crawler.retrieval.Response;
 
 public class CrawlQueues {
 
-    private static final String ERROR_DB_FILENAME = "urlError3.db";
-    private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db";
+    private static final String ERROR_DB_FILENAME = "urlError4.db";
+    private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";
     private static final Segments.Process PROCESS = Segments.Process.LOCALCRAWLING;
 
     protected Switchboard sb;
diff --git a/source/de/anomic/crawler/retrieval/HTTPLoader.java b/source/de/anomic/crawler/retrieval/HTTPLoader.java
index 1b98dd41d..3e5d81a2f 100644
--- a/source/de/anomic/crawler/retrieval/HTTPLoader.java
+++ b/source/de/anomic/crawler/retrieval/HTTPLoader.java
@@ -159,7 +159,7 @@ public final class HTTPLoader {
 
             // check if the url was already indexed
             final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
-            if (dbname != null) { 
+            if (dbname != null) {
                 this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
                 throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
             }
diff --git a/source/de/anomic/crawler/retrieval/Request.java b/source/de/anomic/crawler/retrieval/Request.java
index 33eb8e1c9..eb5bf09bd 100644
--- a/source/de/anomic/crawler/retrieval/Request.java
+++ b/source/de/anomic/crawler/retrieval/Request.java
@@ -53,8 +53,8 @@ public class Request extends WorkflowJob
             + Word.commonHashLength + ", " + // the url's referrer hash
-            "String urlname-80, "
-            + // the name of the url, from anchor tag <a>name</a>
+            "String urlname-256, "
+            + // the name of the url, from anchor tag <a>name</a> (must be big to transport NOLOAD entries)
             "Cardinal appdate-8 {b256}, "
             + // the date of the resource; either file date or first appearance
             "String profile-"
@@ -78,6 +78,8 @@ public class Request extends WorkflowJob
             "Cardinal size-8 {b256}", // size of resource in bytes (if known) or 0 if not known
             Base64Order.enhancedCoder);
 
+    public final static int descrLength = rowdef.column(4).cellwidth;
+
     private byte[] initiator; // the initiator hash, is NULL or "" if it is the own proxy;
                               // if this is generated by a crawl, the own peer hash in entered
     private byte[] refhash; // the url's referrer hash
diff --git a/source/de/anomic/crawler/retrieval/Response.java b/source/de/anomic/crawler/retrieval/Response.java
index aeaa8328e..a3d0eaa8b 100644
--- a/source/de/anomic/crawler/retrieval/Response.java
+++ b/source/de/anomic/crawler/retrieval/Response.java
@@ -162,16 +162,23 @@ public class Response {
         this.content = content;
     }
 
+    /**
+     * create a 'virtual' response that is composed using crawl details from the request object
+     * this is used when the NOLOAD queue is processed
+     * @param request
+     * @param profile
+     */
     public Response(final Request request, final CrawlProfile profile) {
         this.request = request;
         // request and response headers may be zero in case that we process surrogates
         this.requestHeader = new RequestHeader();
         this.responseHeader = new ResponseHeader();
+        this.responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); // tell parser how to handle the content
         if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
         this.responseStatus = "200";
         this.profile = profile;
         this.status = QUEUE_STATE_FRESH;
-        this.content = request.url().toTokens().getBytes();
+        this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes();
     }
 
     public Response(
@@ -824,7 +831,7 @@ public class Response {
         final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
         if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
         try {
-            return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content, false);
+            return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content);
         } catch (final Exception e) {
             return null;
         }
diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java
index dc2cb8be8..ce9946269 100644
--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@@ -60,6 +60,7 @@ import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.ByteBuffer;
 import net.yacy.kelondro.util.FileUtils;
+import de.anomic.crawler.retrieval.Request;
 
 public class Document {
 
@@ -827,7 +828,8 @@ dc_rights
         final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
         for (final Document d: documents) {
             for (final ImageEntry imageReference : d.getImages().values()) {
-                result.put(imageReference.url(), imageReference.alt());
+                // construct an image name which contains the document title to enhance the search process for images
+                result.put(imageReference.url(), description(d, imageReference.alt()));
             }
         }
         return result;
@@ -835,20 +837,57 @@ dc_rights
 
     public static Map<MultiProtocolURI, String> getAudiolinks(final Document[] documents) {
         final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
-        for (final Document d: documents) result.putAll(d.audiolinks);
+        for (final Document d: documents) {
+            for (Map.Entry<MultiProtocolURI, String> e: d.audiolinks.entrySet()) {
+                result.put(e.getKey(), description(d, e.getValue()));
+            }
+        }
         return result;
     }
 
     public static Map<MultiProtocolURI, String> getVideolinks(final Document[] documents) {
         final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
-        for (final Document d: documents) result.putAll(d.videolinks);
+        for (final Document d: documents) {
+            for (Map.Entry<MultiProtocolURI, String> e: d.videolinks.entrySet()) {
+                result.put(e.getKey(), description(d, e.getValue()));
+            }
+        }
         return result;
     }
 
     public static Map<MultiProtocolURI, String> getApplinks(final Document[] documents) {
         final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
-        for (final Document d: documents) result.putAll(d.applinks);
+        for (final Document d: documents) {
+            for (Map.Entry<MultiProtocolURI, String> e: d.applinks.entrySet()) {
+                result.put(e.getKey(), description(d, e.getValue()));
+            }
+        }
         return result;
     }
 
+    private static final String description(Document d, String tagname) {
+        if (tagname == null || tagname.length() == 0) {
+            tagname = d.source.toTokens();
+        }
+        StringBuilder sb = new StringBuilder(60);
+        sb.append(d.dc_title());
+        if (!d.dc_description().equals(d.dc_title()) && sb.length() < Request.descrLength - tagname.length()) {
+            sb.append(' ');
+            sb.append(d.dc_description());
+        }
+        if (sb.length() < Request.descrLength - tagname.length()) {
+            sb.append(' ');
+            sb.append(d.dc_subject(','));
+        }
+        if (tagname.length() > 0) {
+            if (sb.length() > Request.descrLength - tagname.length() - 3) {
+                // cut this off because otherwise the tagname is lost.
+                sb.setLength(Request.descrLength - tagname.length() - 3);
+            }
+            sb.append(" - ");
+            sb.append(tagname);
+        }
+        return sb.toString().trim();
+    }
+
 }
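The new Document.description() budgets the widened urlname column
(Request.descrLength, now 256) so that the media tag name always survives
truncation. A worked sketch of the cut, with hypothetical title, subjects and
alt text (values invented for illustration):

    // page: dc_title "Holiday photos", dc_description "Our trip to Rome",
    // dc_subject "rome,italy"; image: <img alt="colosseum.jpg">
    // composed entry name:
    //   "Holiday photos Our trip to Rome rome,italy - colosseum.jpg"
    // when title + description + subjects would overflow, the front is cut
    // so that " - " plus the tag name still fits into the column:
    final int budget = Request.descrLength - tagname.length() - 3; // 3 chars for " - "
    if (sb.length() > budget) sb.setLength(budget);
    sb.append(" - ");
    sb.append(tagname);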
diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java
index a6ab4c812..958d7d943 100644
--- a/source/net/yacy/document/TextParser.java
+++ b/source/net/yacy/document/TextParser.java
@@ -31,12 +31,11 @@ import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
-import java.util.Map.Entry;
 import java.util.Set;
 import java.util.concurrent.ConcurrentHashMap;
 
-import net.yacy.cora.document.Classification;
 import net.yacy.cora.document.MultiProtocolURI;
+import net.yacy.cora.document.UTF8;
 import net.yacy.document.parser.bzipParser;
 import net.yacy.document.parser.csvParser;
 import net.yacy.document.parser.docParser;
@@ -60,7 +59,6 @@ import net.yacy.document.parser.vcfParser;
 import net.yacy.document.parser.vsdParser;
 import net.yacy.document.parser.xlsParser;
 import net.yacy.document.parser.zipParser;
-import net.yacy.document.parser.html.ImageEntry;
 import net.yacy.document.parser.images.genericImageParser;
 import net.yacy.kelondro.logging.Log;
 import net.yacy.kelondro.util.FileUtils;
@@ -144,8 +142,7 @@ public final class TextParser {
             final MultiProtocolURI location,
             final String mimeType,
             final String charset,
-            final File sourceFile,
-            final boolean multipleVirtualDocs
+            final File sourceFile
         ) throws InterruptedException, Parser.Failure {
 
         BufferedInputStream sourceStream = null;
@@ -158,7 +155,7 @@ public final class TextParser {
                 throw new Parser.Failure(errorMsg, location);
             }
             sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
-            docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream, multipleVirtualDocs);
+            docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
         } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@@ -176,8 +173,7 @@ public final class TextParser {
             final MultiProtocolURI location,
             String mimeType,
             final String charset,
-            final byte[] content,
-            final boolean multipleVirtualDocs
+            final byte[] content
         ) throws Parser.Failure {
         if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
         mimeType = normalizeMimeType(mimeType);
@@ -193,9 +189,6 @@ public final class TextParser {
 
         Document[] docs = parseSource(location, mimeType, idioms, charset, content);
 
-        // finally enrich the docs set with virtual docs from the enclosed documents
-        if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
-
         return docs;
     }
 
@@ -204,8 +197,7 @@ public final class TextParser {
             String mimeType,
             final String charset,
             final long contentLength,
-            final InputStream sourceStream,
-            final boolean multipleVirtualDocs
+            final InputStream sourceStream
         ) throws Parser.Failure {
         if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
         mimeType = normalizeMimeType(mimeType);
@@ -236,9 +228,6 @@ public final class TextParser {
         }
         Document[] docs = parseSource(location, mimeType, idioms, charset, b);
 
-        // finally enrich the docs set with virtual docs from the enclosed documents
-        if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
-
         return docs;
     }
 
@@ -281,7 +270,13 @@ public final class TextParser {
         final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
         if (MemoryControl.request(sourceArray.length * 6, false)) {
             for (final Parser parser: parsers) {
-                ByteArrayInputStream bis = new ByteArrayInputStream(sourceArray);
+                ByteArrayInputStream bis;
+                if (mimeType.equals("text/plain") && parser.getName().equals("HTML Parser")) {
+                    // a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.
+                    bis = new ByteArrayInputStream(UTF8.getBytes("<html><head></head><body><h1>" + UTF8.String(sourceArray) + "</h1></body></html>"));
+                } else {
+                    bis = new ByteArrayInputStream(sourceArray);
+                }
                 try {
                     docs = parser.parse(location, mimeType, documentCharset, bis);
                 } catch (final Parser.Failure e) {
@@ -477,73 +472,4 @@ public final class TextParser {
         if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v);
     }
 
-    /**
-     * produce virtual documents for each of the link that is contained in the document
-     * @param document
-     * @return
-     */
-    public static Document[] virtualDocs(final Document document) {
-
-        final ArrayList<Document> docs = new ArrayList<Document>();
-        docs.add(document);
-        for (final Map.Entry<MultiProtocolURI, String> link: document.getApplinks().entrySet()) {
-            docs.add(genLinkDocs("application", link.getKey(), link.getValue(), document.getContentLanguages()));
-        }
-        for (final Map.Entry<MultiProtocolURI, String> link: document.getAudiolinks().entrySet()) {
-            docs.add(genLinkDocs("audio", link.getKey(), link.getValue(), document.getContentLanguages()));
-        }
-        for (final Map.Entry<MultiProtocolURI, String> link: document.getVideolinks().entrySet()) {
-            docs.add(genLinkDocs("video", link.getKey(), link.getValue(), document.getContentLanguages()));
-        }
-        for (final Entry<MultiProtocolURI, ImageEntry> link: document.getImages().entrySet()) {
-            docs.add(genImageDocs(link.getValue()));
-        }
-
-        // finally return the list of documents
-        return docs.toArray(new Document[docs.size()]);
-    }
-
-    private final static Document genLinkDocs(final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
-        //System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
-        return new Document(
-                uri,
-                Classification.ext2mime(uri.getFileExtension()),
-                "UTF-8",
-                null,
-                contentLanguages,
-                null,
-                descr,
-                "",
-                "",
-                new String[]{descr},
-                type,
-                0.0f, 0.0f,
-                uri.toNormalform(false, false),
-                null,
-                null,
-                null,
-                false);
-    }
-
-    private final static Document genImageDocs(final ImageEntry img) {
-        //System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
-        return new Document(
-                img.url(),
-                Classification.ext2mime(img.url().getFileExtension()),
-                "UTF-8",
-                null,
-                null,
-                null,
-                img.alt(),
-                "",
-                "",
-                new String[]{img.alt()},
-                "image",
-                0.0f, 0.0f,
-                img.url().toNormalform(false, false),
-                null,
-                null,
-                null,
-                false);
-    }
 }
diff --git a/source/net/yacy/document/importer/MediawikiImporter.java b/source/net/yacy/document/importer/MediawikiImporter.java
index 432df2181..d45657a19 100644
--- a/source/net/yacy/document/importer/MediawikiImporter.java
+++ b/source/net/yacy/document/importer/MediawikiImporter.java
@@ -101,14 +101,17 @@ public class MediawikiImporter extends Thread implements Importer {
         this.urlStub = null;
     }
 
+    @Override
     public int count() {
         return this.count;
     }
 
+    @Override
     public String source() {
         return this.sourcefile.getAbsolutePath();
     }
 
+    @Override
     public String status() {
         return "";
     }
@@ -117,6 +120,7 @@ public class MediawikiImporter extends Thread implements Importer {
      * return the number of articles per second
      * @return
      */
+    @Override
     public int speed() {
         if (this.count == 0) return 0;
         return (int) (this.count / Math.max(1L, runningTime() ));
@@ -126,14 +130,17 @@ public class MediawikiImporter extends Thread implements Importer {
      * return the remaining seconds for the completion of all records in milliseconds
      * @return
      */
+    @Override
     public long remainingTime() {
         return Math.max(0, this.approxdocs - this.count) / Math.max(1, speed() );
     }
 
+    @Override
     public long runningTime() {
         return (System.currentTimeMillis() - this.start) / 1000L;
     }
 
+    @Override
     public void run() {
         this.start = System.currentTimeMillis();
         try {
@@ -287,6 +294,7 @@ public class MediawikiImporter extends Thread implements Importer {
             this.mediawikixml = mediawikixml;
         }
 
+        @Override
         public void run() {
             try {
                 createIndex(this.mediawikixml);
@@ -365,6 +373,7 @@ public class MediawikiImporter extends Thread implements Importer {
             }
         }
 
+        @Override
        public Integer call() {
             wikisourcerecord r;
             try {
@@ -412,6 +421,7 @@ public class MediawikiImporter extends Thread implements Importer {
             }
         }
 
+        @Override
         public Integer call() {
             wikisourcerecord r;
             wikiraw c;
@@ -505,7 +515,7 @@ public class MediawikiImporter extends Thread implements Importer {
         public void genDocument() throws Parser.Failure {
             try {
                 this.url = new DigestURI(this.urlStub + this.title);
-                final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html), false);
+                final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html));
                 this.document = Document.mergeDocuments(this.url, "text/html", parsed);
                 // the wiki parser is not able to find the proper title in the source text, so it must be set here
                 this.document.setTitle(this.title);
@@ -626,6 +636,7 @@ public class MediawikiImporter extends Thread implements Importer {
             this.out = out;
         }
 
+        @Override
         public Integer call() {
             wikiparserrecord record;
             try {
@@ -682,6 +693,7 @@ public class MediawikiImporter extends Thread implements Importer {
             this.outputfilename = null;
         }
 
+        @Override
         public Integer call() {
             wikiparserrecord record;
             try {
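The remaining parser and loader diffs are mechanical call-site updates for
the narrowed parseSource signatures: the multipleVirtualDocs flag is gone
because virtual documents are no longer produced at parse time. Call sites
change like this (sketch with placeholder arguments):

    // before: callers decided about virtual documents at parse time
    //   final Document[] docs = TextParser.parseSource(url, mime, "UTF-8", content, false);
    // after: the flag is dropped; embedded links travel through the NOLOAD queue instead
    final Document[] docs = TextParser.parseSource(url, mime, "UTF-8", content);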
diff --git a/source/net/yacy/document/parser/bzipParser.java b/source/net/yacy/document/parser/bzipParser.java
index e2dba1b6c..8e6d3216d 100644
--- a/source/net/yacy/document/parser/bzipParser.java
+++ b/source/net/yacy/document/parser/bzipParser.java
@@ -55,6 +55,7 @@ public class bzipParser extends AbstractParser implements Parser {
         this.SUPPORTED_MIME_TYPES.add("application/x-stuffit");
     }
 
+    @Override
     public Document[] parse(final MultiProtocolURI location, final String mimeType,
             final String charset, final InputStream source)
             throws Parser.Failure, InterruptedException {
@@ -93,7 +94,7 @@ public class bzipParser extends AbstractParser implements Parser {
             out.close();
 
             // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location, null, null, tempFile, false);
+            docs = TextParser.parseSource(location, null, null, tempFile);
         } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
diff --git a/source/net/yacy/document/parser/gzipParser.java b/source/net/yacy/document/parser/gzipParser.java
index 0680b9e22..db4097c5f 100644
--- a/source/net/yacy/document/parser/gzipParser.java
+++ b/source/net/yacy/document/parser/gzipParser.java
@@ -54,6 +54,7 @@ public class gzipParser extends AbstractParser implements Parser {
         this.SUPPORTED_MIME_TYPES.add("gzip/document");
     }
 
+    @Override
     public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
 
         File tempFile = null;
@@ -78,7 +79,7 @@ public class gzipParser extends AbstractParser implements Parser {
             out.close();
 
             // creating a new parser class to parse the unzipped content
-            docs = TextParser.parseSource(location,null,null,tempFile, false);
+            docs = TextParser.parseSource(location,null,null,tempFile);
         } catch (final Exception e) {
             if (e instanceof InterruptedException) throw (InterruptedException) e;
             if (e instanceof Parser.Failure) throw (Parser.Failure) e;
diff --git a/source/net/yacy/document/parser/sevenzipParser.java b/source/net/yacy/document/parser/sevenzipParser.java
index d42c625c9..041b428f6 100644
--- a/source/net/yacy/document/parser/sevenzipParser.java
+++ b/source/net/yacy/document/parser/sevenzipParser.java
@@ -99,6 +99,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
         }
     }
 
+    @Override
     public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset,
             final InputStream source) throws Parser.Failure, InterruptedException {
         try {
@@ -166,7 +167,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
             // below for reversion of the effects
             final MultiProtocolURI url = MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath);
             final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
-            theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray(), false);
+            theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
 
             this.doc.addSubDocuments(theDocs);
         }
diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java
index df7f58d12..7deb195e4 100644
--- a/source/net/yacy/document/parser/tarParser.java
+++ b/source/net/yacy/document/parser/tarParser.java
@@ -49,7 +49,7 @@ import org.apache.tools.tar.TarInputStream;
 public class tarParser extends AbstractParser implements Parser {
 
     private final static String MAGIC = "ustar"; // A magic for a tar archive, may appear at #101h-#105
-    
+
     public tarParser() {
         super("Tape Archive File Parser");
         this.SUPPORTED_EXTENSIONS.add("tar");
@@ -59,6 +59,7 @@ public class tarParser extends AbstractParser implements Parser {
         this.SUPPORTED_MIME_TYPES.add("multipart/x-tar");
     }
 
+    @Override
     public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
 
         final List<Document> docacc = new ArrayList<Document>();
@@ -88,7 +89,7 @@ public class tarParser extends AbstractParser implements Parser {
                 try {
                     tmp = FileUtils.createTempFile(this.getClass(), name);
                     FileUtils.copy(tis, tmp, entry.getSize());
-                    subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp, false);
+                    subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
                     if (subDocs == null) continue;
                     for (final Document d: subDocs) docacc.add(d);
                 } catch (final Parser.Failure e) {
@@ -103,7 +104,7 @@ public class tarParser extends AbstractParser implements Parser {
         }
         return docacc.toArray(new Document[docacc.size()]);
     }
-    
+
     public final static boolean isTar(File f) {
         if (!f.exists() || f.length() < 0x105) return false;
         try {
diff --git a/source/net/yacy/document/parser/zipParser.java b/source/net/yacy/document/parser/zipParser.java
index b216fe099..d1d6277cf 100644
--- a/source/net/yacy/document/parser/zipParser.java
+++ b/source/net/yacy/document/parser/zipParser.java
@@ -59,6 +59,7 @@ public class zipParser extends AbstractParser implements Parser {
         this.SUPPORTED_MIME_TYPES.add("application/vnd.android.package-archive");
     }
 
+    @Override
     public Document[] parse(final MultiProtocolURI url, final String mimeType,
             final String charset, final InputStream source)
             throws Parser.Failure, InterruptedException {
@@ -87,7 +88,7 @@ public class zipParser extends AbstractParser implements Parser {
                     FileUtils.copy(zis, tmp, entry.getSize());
                     final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name);
                     //this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
-                    docs = TextParser.parseSource(virtualURL, mime, null, tmp, false);
+                    docs = TextParser.parseSource(virtualURL, mime, null, tmp);
                     if (docs == null) continue;
                     for (final Document d: docs) docacc.add(d);
                 } catch (final Parser.Failure e) {
diff --git a/source/net/yacy/repository/LoaderDispatcher.java b/source/net/yacy/repository/LoaderDispatcher.java
index 0a1e1a975..254f0c66a 100644
--- a/source/net/yacy/repository/LoaderDispatcher.java
+++ b/source/net/yacy/repository/LoaderDispatcher.java
@@ -386,7 +386,7 @@ public final class LoaderDispatcher {
         final String supportError = TextParser.supports(url, responseHeader.mime());
         if (supportError != null) throw new IOException("no parser support: " + supportError);
         try {
-            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent(), false);
+            documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent());
             if (documents == null) throw new IOException("document == null");
         } catch (final Exception e) {
             throw new IOException("parser error: " + e.getMessage());
diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java
index ce4252d9b..7acda2996 100644
--- a/source/net/yacy/search/Switchboard.java
+++ b/source/net/yacy/search/Switchboard.java
@@ -2329,8 +2329,7 @@ public final class Switchboard extends serverSwitch
                     response.url(),
                     response.getMimeType(),
                     response.getCharacterEncoding(),
-                    response.getContent(),
-                    response.profile().directDocByURL());
+                    response.getContent());
             if ( documents == null ) {
                 throw new Parser.Failure("Parser returned null.", response.url());
             }
diff --git a/source/net/yacy/search/index/DocumentIndex.java b/source/net/yacy/search/index/DocumentIndex.java
index e74df82a8..e29397e4f 100644
--- a/source/net/yacy/search/index/DocumentIndex.java
+++ b/source/net/yacy/search/index/DocumentIndex.java
@@ -150,7 +150,7 @@ public class DocumentIndex extends Segment
             length = -1;
         }
         try {
-            documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1), true);
+            documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
        } catch ( final Exception e ) {
             throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
         }
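As a net effect, collecting media links from parsed documents now yields
enriched, searchable descriptions instead of bare anchor or alt texts. A
short usage sketch, with documents being any parsed Document[] array:

    // each value is now "<title> <description> <subjects> - <tag name>",
    // trimmed to fit the Request urlname column, ready for the NOLOAD queue
    final Map<MultiProtocolURI, String> audio = Document.getAudiolinks(documents);
    final Map<MultiProtocolURI, String> video = Document.getVideolinks(documents);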