- Redesigned crawler and parser to accept embedded links from the NOLOAD
  queue and not from virtual documents generated by the parser.
- The parser now generates descriptive texts for NOLOAD entries,
which makes it possible to find media content using the search
index instead of the media prefetch algorithm during search (which
was costly)
- Removed the media-search prefetch process from image search
pull/1/head
Michael Peter Christen 13 years ago
parent 3bea25c513
commit 659178942f

@ -56,7 +56,7 @@ import de.anomic.crawler.retrieval.Request;
public class Balancer {
private static final String indexSuffix = "9.db";
private static final String indexSuffix = "A.db";
private static final int EcoFSBufferSize = 1000;
private static final int objectIndexBufferSize = 1000;
private static final String localhost = "localhost";

@ -60,8 +60,8 @@ import de.anomic.crawler.retrieval.Response;
public class CrawlQueues {
private static final String ERROR_DB_FILENAME = "urlError3.db";
private static final String DELEGATED_DB_FILENAME = "urlDelegated3.db";
private static final String ERROR_DB_FILENAME = "urlError4.db";
private static final String DELEGATED_DB_FILENAME = "urlDelegated4.db";
private static final Segments.Process PROCESS = Segments.Process.LOCALCRAWLING;
protected Switchboard sb;

@ -159,7 +159,7 @@ public final class HTTPLoader {
// check if the url was already indexed
final String dbname = this.sb.urlExists(Segments.Process.LOCALCRAWLING, redirectionUrl.hash());
if (dbname != null) {
if (dbname != null) { //OTTO
this.sb.crawlQueues.errorURL.push(request, this.sb.peers.mySeed().hash.getBytes(), new Date(), 1, FailCategory.TEMPORARY_NETWORK_FAILURE, "redirection to double content", code);
throw new IOException("CRAWLER Redirection of URL=" + request.url().toString() + " ignored. The url appears already in db " + dbname);
}

@ -53,8 +53,8 @@ public class Request extends WorkflowJob
+ Word.commonHashLength
+ ", "
+ // the url's referrer hash
"String urlname-80, "
+ // the name of the url, from anchor tag <a>name</a>
"String urlname-256, "
+ // the name of the url, from anchor tag <a>name</a> (must be big to transport NOLOAD entries)
"Cardinal appdate-8 {b256}, "
+ // the date of the resource; either file date or first appearance
"String profile-"
@ -78,6 +78,8 @@ public class Request extends WorkflowJob
"Cardinal size-8 {b256}", // size of resource in bytes (if known) or 0 if not known
Base64Order.enhancedCoder);
public final static int descrLength = rowdef.column(4).cellwidth;
private byte[] initiator; // the initiator hash, is NULL or "" if it is the own proxy;
// if this is generated by a crawl, the own peer hash in entered
private byte[] refhash; // the url's referrer hash

@ -162,16 +162,23 @@ public class Response {
this.content = content;
}
/**
 * Create a 'virtual' response that is composed using crawl details from the request object.
 * This is used when the NOLOAD queue is processed.
 * @param request the crawl request this virtual response is derived from
 * @param profile the crawl profile in effect for this request
 */
public Response(final Request request, final CrawlProfile profile) {
    this.request = request;
    // request and response headers may be zero in case that we process surrogates
    this.requestHeader = new RequestHeader();
    this.responseHeader = new ResponseHeader();
    this.responseHeader.put(HeaderFramework.CONTENT_TYPE, "text/plain"); // tell parser how to handle the content
    if (request.size() > 0) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
    this.responseStatus = "200";
    this.profile = profile;
    this.status = QUEUE_STATE_FRESH;
    // use the (possibly enriched) request name as content when present; otherwise
    // fall back to tokens from the URL. (The previous extra assignment of the URL
    // tokens before this line was a dead store and has been removed.)
    this.content = request.name().length() > 0 ? request.name().getBytes() : request.url().toTokens().getBytes();
}
public Response(
@ -824,7 +831,7 @@ public class Response {
final String supportError = TextParser.supports(url(), this.responseHeader == null ? null : this.responseHeader.mime());
if (supportError != null) throw new Parser.Failure("no parser support:" + supportError, url());
try {
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content, false);
return TextParser.parseSource(url(), this.responseHeader == null ? null : this.responseHeader.mime(), this.responseHeader == null ? "UTF-8" : this.responseHeader.getCharacterEncoding(), this.content);
} catch (final Exception e) {
return null;
}

@ -60,6 +60,7 @@ import net.yacy.document.parser.html.ImageEntry;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.retrieval.Request;
public class Document {
@ -827,7 +828,8 @@ dc_rights
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (final Document d: documents) {
for (final ImageEntry imageReference : d.getImages().values()) {
result.put(imageReference.url(), imageReference.alt());
// construct a image name which contains the document title to enhance the search process for images
result.put(imageReference.url(), description(d, imageReference.alt()));
}
}
return result;
@ -835,20 +837,57 @@ dc_rights
/**
 * Collect all audio links from the given documents.
 * Each link URL is mapped to an enriched description text so that
 * NOLOAD queue entries carry searchable text.
 * @param documents the parsed documents to scan
 * @return map from audio link URL to its enriched description
 */
public static Map<MultiProtocolURI, String> getAudiolinks(final Document[] documents) {
    final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
    // no pre-fill with putAll(d.audiolinks): every entry is written below
    // with an enriched description, so a pre-fill would be a redundant pass
    for (final Document d: documents) {
        for (final Map.Entry<MultiProtocolURI, String> e: d.audiolinks.entrySet()) {
            result.put(e.getKey(), description(d, e.getValue()));
        }
    }
    return result;
}
/**
 * Collect all video links from the given documents.
 * Each link URL is mapped to an enriched description text so that
 * NOLOAD queue entries carry searchable text.
 * @param documents the parsed documents to scan
 * @return map from video link URL to its enriched description
 */
public static Map<MultiProtocolURI, String> getVideolinks(final Document[] documents) {
    final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
    // no pre-fill with putAll(d.videolinks): every entry is written below
    // with an enriched description, so a pre-fill would be a redundant pass
    for (final Document d: documents) {
        for (final Map.Entry<MultiProtocolURI, String> e: d.videolinks.entrySet()) {
            result.put(e.getKey(), description(d, e.getValue()));
        }
    }
    return result;
}
/**
 * Collect all application links from the given documents.
 * Each link URL is mapped to an enriched description text so that
 * NOLOAD queue entries carry searchable text.
 * @param documents the parsed documents to scan
 * @return map from application link URL to its enriched description
 */
public static Map<MultiProtocolURI, String> getApplinks(final Document[] documents) {
    final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
    // no pre-fill with putAll(d.applinks): every entry is written below
    // with an enriched description, so a pre-fill would be a redundant pass
    for (final Document d: documents) {
        for (final Map.Entry<MultiProtocolURI, String> e: d.applinks.entrySet()) {
            result.put(e.getKey(), description(d, e.getValue()));
        }
    }
    return result;
}
/**
 * Construct a description text for a media link: the embedding document's
 * title, description and subject (as far as they fit), followed by the
 * link's own tag/anchor text. The result is kept within Request.descrLength
 * so it can be transported in the crawl queue's urlname column.
 * @param d the document that embeds the media link
 * @param tagname the anchor/alt text of the link; may be null or empty
 * @return a trimmed description of at most roughly Request.descrLength characters
 */
private static final String description(Document d, String tagname) {
    if (tagname == null || tagname.length() == 0) {
        // no anchor/alt text given: fall back to tokens derived from the source URL
        tagname = d.source.toTokens();
    }
    StringBuilder sb = new StringBuilder(60);
    sb.append(d.dc_title());
    if (!d.dc_description().equals(d.dc_title()) && sb.length() < Request.descrLength - tagname.length()) {
        sb.append(' ');
        sb.append(d.dc_description());
    }
    if (sb.length() < Request.descrLength - tagname.length()) {
        sb.append(' ');
        sb.append(d.dc_subject(','));
    }
    if (tagname.length() > 0) {
        // reserve room for " - " + tagname; clamp at 0 because a tagname longer
        // than descrLength - 3 would otherwise make setLength() throw on a
        // negative argument
        final int limit = Math.max(0, Request.descrLength - tagname.length() - 3);
        if (sb.length() > limit) {
            // cut this off because otherwise the tagname is lost.
            sb.setLength(limit);
        }
        sb.append(" - ");
        sb.append(tagname);
    }
    return sb.toString().trim();
}
}

@ -31,12 +31,11 @@ import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import java.util.concurrent.ConcurrentHashMap;
import net.yacy.cora.document.Classification;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.parser.bzipParser;
import net.yacy.document.parser.csvParser;
import net.yacy.document.parser.docParser;
@ -60,7 +59,6 @@ import net.yacy.document.parser.vcfParser;
import net.yacy.document.parser.vsdParser;
import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser;
import net.yacy.document.parser.html.ImageEntry;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
@ -144,8 +142,7 @@ public final class TextParser {
final MultiProtocolURI location,
final String mimeType,
final String charset,
final File sourceFile,
final boolean multipleVirtualDocs
final File sourceFile
) throws InterruptedException, Parser.Failure {
BufferedInputStream sourceStream = null;
@ -158,7 +155,7 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
}
sourceStream = new BufferedInputStream(new FileInputStream(sourceFile));
docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream, multipleVirtualDocs);
docs = parseSource(location, mimeType, charset, sourceFile.length(), sourceStream);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;
@ -176,8 +173,7 @@ public final class TextParser {
final MultiProtocolURI location,
String mimeType,
final String charset,
final byte[] content,
final boolean multipleVirtualDocs
final byte[] content
) throws Parser.Failure {
if (log.isFine()) log.logFine("Parsing '" + location + "' from byte-array");
mimeType = normalizeMimeType(mimeType);
@ -193,9 +189,6 @@ public final class TextParser {
Document[] docs = parseSource(location, mimeType, idioms, charset, content);
// finally enrich the docs set with virtual docs from the enclosed documents
if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
return docs;
}
@ -204,8 +197,7 @@ public final class TextParser {
String mimeType,
final String charset,
final long contentLength,
final InputStream sourceStream,
final boolean multipleVirtualDocs
final InputStream sourceStream
) throws Parser.Failure {
if (log.isFine()) log.logFine("Parsing '" + location + "' from stream");
mimeType = normalizeMimeType(mimeType);
@ -236,9 +228,6 @@ public final class TextParser {
}
Document[] docs = parseSource(location, mimeType, idioms, charset, b);
// finally enrich the docs set with virtual docs from the enclosed documents
if (multipleVirtualDocs && docs.length == 1) docs = virtualDocs(docs[0]);
return docs;
}
@ -281,7 +270,13 @@ public final class TextParser {
final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
if (MemoryControl.request(sourceArray.length * 6, false)) {
for (final Parser parser: parsers) {
ByteArrayInputStream bis = new ByteArrayInputStream(sourceArray);
ByteArrayInputStream bis;
if (mimeType.equals("text/plain") && parser.getName().equals("HTML Parser")) {
// a hack to simulate html files .. is needed for NOLOAD queues. This throws their data into virtual text/plain messages.
bis = new ByteArrayInputStream(UTF8.getBytes("<html><head></head><body><h1>" + UTF8.String(sourceArray) + "</h1></body><html>"));
} else {
bis = new ByteArrayInputStream(sourceArray);
}
try {
docs = parser.parse(location, mimeType, documentCharset, bis);
} catch (final Parser.Failure e) {
@ -477,73 +472,4 @@ public final class TextParser {
if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v);
}
/**
 * Produce virtual documents for each of the links contained in the document.
 * The returned array holds the original document first, followed by one
 * virtual document per application, audio, video and image link.
 * @param document the parsed source document
 * @return the document plus one virtual document per embedded media link
 */
public static Document[] virtualDocs(final Document document) {
    final ArrayList<Document> collected = new ArrayList<Document>();
    collected.add(document);
    for (final Map.Entry<MultiProtocolURI, String> appLink : document.getApplinks().entrySet()) {
        collected.add(genLinkDocs("application", appLink.getKey(), appLink.getValue(), document.getContentLanguages()));
    }
    for (final Map.Entry<MultiProtocolURI, String> audioLink : document.getAudiolinks().entrySet()) {
        collected.add(genLinkDocs("audio", audioLink.getKey(), audioLink.getValue(), document.getContentLanguages()));
    }
    for (final Map.Entry<MultiProtocolURI, String> videoLink : document.getVideolinks().entrySet()) {
        collected.add(genLinkDocs("video", videoLink.getKey(), videoLink.getValue(), document.getContentLanguages()));
    }
    for (final Map.Entry<MultiProtocolURI, ImageEntry> imageLink : document.getImages().entrySet()) {
        collected.add(genImageDocs(imageLink.getValue()));
    }
    // finally return the list of documents
    return collected.toArray(new Document[collected.size()]);
}
/**
 * Generate a minimal virtual document describing a single media link.
 * @param type media type tag used as the document keyword, e.g. "application", "audio", "video"
 * @param uri the link target
 * @param descr the link's description (anchor text)
 * @param contentLanguages languages of the embedding document
 * @return a Document carrying only the link's metadata
 */
private final static Document genLinkDocs(final String type, final MultiProtocolURI uri, final String descr, final Set<String> contentLanguages) {
//System.out.println("HTMLPARSER-LINK " + type + ": " + uri.toNormalform(true, false) + " / " + descr);
// NOTE(review): the positional arguments below follow the Document constructor;
// the meaning of the null slots is not visible in this file -- confirm against Document's constructor
return new Document(
uri,
Classification.ext2mime(uri.getFileExtension()), // mime guessed from the file extension
"UTF-8",
null,
contentLanguages,
null,
descr, // used as the document title
"",
"",
new String[]{descr}, // also used as section/keyword text
type,
0.0f, 0.0f, // no geo coordinates for virtual link documents
uri.toNormalform(false, false), // the link target itself serves as the text content
null,
null,
null,
false);
}
/**
 * Generate a minimal virtual document describing a single image link.
 * @param img the image entry (URL plus alt text) found in the source document
 * @return a Document carrying only the image's metadata
 */
private final static Document genImageDocs(final ImageEntry img) {
//System.out.println("HTMLPARSER-LINK image: " + img.url().toNormalform(true, false) + " / " + img.alt());
// NOTE(review): the positional arguments below follow the Document constructor;
// the meaning of the null slots is not visible in this file -- confirm against Document's constructor
return new Document(
img.url(),
Classification.ext2mime(img.url().getFileExtension()), // mime guessed from the file extension
"UTF-8",
null,
null,
null,
img.alt(), // the image alt text is used as the document title
"",
"",
new String[]{img.alt()}, // also used as section/keyword text
"image",
0.0f, 0.0f, // no geo coordinates for virtual image documents
img.url().toNormalform(false, false), // the image URL itself serves as the text content
null,
null,
null,
false);
}
}

@ -101,14 +101,17 @@ public class MediawikiImporter extends Thread implements Importer {
this.urlStub = null;
}
/** @return the counter of records imported so far (see this.count updates in run/call) */
@Override
public int count() {
return this.count;
}
/** @return the absolute path of the mediawiki dump file being imported */
@Override
public String source() {
return this.sourcefile.getAbsolutePath();
}
/** @return a status message for monitoring; this importer reports none, so always the empty string */
@Override
public String status() {
return "";
}
@ -117,6 +120,7 @@ public class MediawikiImporter extends Thread implements Importer {
* return the number of articles per second
* @return
*/
@Override
public int speed() {
if (this.count == 0) return 0;
return (int) (this.count / Math.max(1L, runningTime() ));
@ -126,14 +130,17 @@ public class MediawikiImporter extends Thread implements Importer {
* return the remaining seconds for the completion of all records in milliseconds
* @return
*/
/**
 * Estimate the remaining processing time from the approximate total
 * document count and the current import speed.
 * @return estimated remaining time, never negative
 */
@Override
public long remainingTime() {
    final long remainingDocs = Math.max(0, this.approxdocs - this.count);
    return remainingDocs / Math.max(1, speed());
}
/**
 * @return the elapsed time since this importer was started, in seconds
 */
@Override
public long runningTime() {
    final long elapsedMillis = System.currentTimeMillis() - this.start;
    return elapsedMillis / 1000L;
}
@Override
public void run() {
this.start = System.currentTimeMillis();
try {
@ -287,6 +294,7 @@ public class MediawikiImporter extends Thread implements Importer {
this.mediawikixml = mediawikixml;
}
@Override
public void run() {
try {
createIndex(this.mediawikixml);
@ -365,6 +373,7 @@ public class MediawikiImporter extends Thread implements Importer {
}
}
@Override
public Integer call() {
wikisourcerecord r;
try {
@ -412,6 +421,7 @@ public class MediawikiImporter extends Thread implements Importer {
}
}
@Override
public Integer call() {
wikisourcerecord r;
wikiraw c;
@ -505,7 +515,7 @@ public class MediawikiImporter extends Thread implements Importer {
public void genDocument() throws Parser.Failure {
try {
this.url = new DigestURI(this.urlStub + this.title);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html), false);
final Document[] parsed = TextParser.parseSource(this.url, "text/html", "UTF-8", UTF8.getBytes(this.html));
this.document = Document.mergeDocuments(this.url, "text/html", parsed);
// the wiki parser is not able to find the proper title in the source text, so it must be set here
this.document.setTitle(this.title);
@ -626,6 +636,7 @@ public class MediawikiImporter extends Thread implements Importer {
this.out = out;
}
@Override
public Integer call() {
wikiparserrecord record;
try {
@ -682,6 +693,7 @@ public class MediawikiImporter extends Thread implements Importer {
this.outputfilename = null;
}
@Override
public Integer call() {
wikiparserrecord record;
try {

@ -55,6 +55,7 @@ public class bzipParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/x-stuffit");
}
@Override
public Document[] parse(final MultiProtocolURI location, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
@ -93,7 +94,7 @@ public class bzipParser extends AbstractParser implements Parser {
out.close();
// creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location, null, null, tempFile, false);
docs = TextParser.parseSource(location, null, null, tempFile);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -54,6 +54,7 @@ public class gzipParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("gzip/document");
}
@Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
File tempFile = null;
@ -78,7 +79,7 @@ public class gzipParser extends AbstractParser implements Parser {
out.close();
// creating a new parser class to parse the unzipped content
docs = TextParser.parseSource(location,null,null,tempFile, false);
docs = TextParser.parseSource(location,null,null,tempFile);
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
if (e instanceof Parser.Failure) throw (Parser.Failure) e;

@ -99,6 +99,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
}
}
@Override
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset,
final InputStream source) throws Parser.Failure, InterruptedException {
try {
@ -166,7 +167,7 @@ public class sevenzipParser extends AbstractParser implements Parser {
// below for reversion of the effects
final MultiProtocolURI url = MultiProtocolURI.newURL(this.doc.dc_source(), this.prefix + "/" + super.filePath);
final String mime = TextParser.mimeOf(super.filePath.substring(super.filePath.lastIndexOf('.') + 1));
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray(), false);
theDocs = TextParser.parseSource(url, mime, null, this.cfos.toByteArray());
this.doc.addSubDocuments(theDocs);
}

@ -49,7 +49,7 @@ import org.apache.tools.tar.TarInputStream;
public class tarParser extends AbstractParser implements Parser {
private final static String MAGIC = "ustar"; // A magic for a tar archive, may appear at #101h-#105
public tarParser() {
super("Tape Archive File Parser");
this.SUPPORTED_EXTENSIONS.add("tar");
@ -59,6 +59,7 @@ public class tarParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("multipart/x-tar");
}
@Override
public Document[] parse(final MultiProtocolURI url, final String mimeType, final String charset, InputStream source) throws Parser.Failure, InterruptedException {
final List<Document> docacc = new ArrayList<Document>();
@ -88,7 +89,7 @@ public class tarParser extends AbstractParser implements Parser {
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(tis, tmp, entry.getSize());
subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp, false);
subDocs = TextParser.parseSource(MultiProtocolURI.newURL(url,"#" + name), mime, null, tmp);
if (subDocs == null) continue;
for (final Document d: subDocs) docacc.add(d);
} catch (final Parser.Failure e) {
@ -103,7 +104,7 @@ public class tarParser extends AbstractParser implements Parser {
}
return docacc.toArray(new Document[docacc.size()]);
}
public final static boolean isTar(File f) {
if (!f.exists() || f.length() < 0x105) return false;
try {

@ -59,6 +59,7 @@ public class zipParser extends AbstractParser implements Parser {
this.SUPPORTED_MIME_TYPES.add("application/vnd.android.package-archive");
}
@Override
public Document[] parse(final MultiProtocolURI url, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
@ -87,7 +88,7 @@ public class zipParser extends AbstractParser implements Parser {
FileUtils.copy(zis, tmp, entry.getSize());
final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(virtualURL, mime, null, tmp, false);
docs = TextParser.parseSource(virtualURL, mime, null, tmp);
if (docs == null) continue;
for (final Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) {

@ -386,7 +386,7 @@ public final class LoaderDispatcher {
final String supportError = TextParser.supports(url, responseHeader.mime());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent(), false);
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), response.getContent());
if (documents == null) throw new IOException("document == null");
} catch (final Exception e) {
throw new IOException("parser error: " + e.getMessage());

@ -2329,8 +2329,7 @@ public final class Switchboard extends serverSwitch
response.url(),
response.getMimeType(),
response.getCharacterEncoding(),
response.getContent(),
response.profile().directDocByURL());
response.getContent());
if ( documents == null ) {
throw new Parser.Failure("Parser returned null.", response.url());
}

@ -150,7 +150,7 @@ public class DocumentIndex extends Segment
length = -1;
}
try {
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1), true);
documents = TextParser.parseSource(url, null, null, length, url.getInputStream(null, -1));
} catch ( final Exception e ) {
throw new IOException("cannot parse " + url.toString() + ": " + e.getMessage());
}

Loading…
Cancel
Save