- applied many small performance hacks, e.g. reusing a precompiled regex pattern in ListManager (a minimal sketch follows below)

- added a memory limitation in the zip parser and the pdf parser (a hypothetical sketch follows below)
- added search throttling: if too many search queries are still waiting to be computed, new requests are not accepted for some time. If after one second there is still no capacity to perform another search, the search terminates with no results. This should only happen in DoS-like situations and under strong load on a peer, e.g. when it is integrated into MetaGer. The throttling loop is shown condensed below.
- added a search cache deletion process that removes search requests when throttling happens

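One example of those small hacks, taken from the ListManager hunks below: String.split(",") compiles its regex argument on every call, so a shared precompiled Pattern is used instead. A minimal self-contained sketch of the idea:

import java.util.Arrays;
import java.util.List;
import java.util.regex.Pattern;

public class CommaSplitExample {
    // compiled once and reused; string.split(",") would recompile this pattern on every call
    private final static Pattern commaPattern = Pattern.compile(",");

    public static List<String> split(final String string) {
        return Arrays.asList(commaPattern.split(string, 0));
    }
}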
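The zip/pdf parser change itself is not part of the diff below; the following is only a hypothetical sketch of such a memory limitation, and the limit constant and method name are assumptions rather than the actual YaCy code:

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;

public class LimitedEntryReader {
    // assumed limit; the real value used by the parsers is not shown in this commit excerpt
    private static final long MAX_BYTES = 32L * 1024L * 1024L;

    public static byte[] readWithLimit(final InputStream in) throws IOException {
        final ByteArrayOutputStream buffer = new ByteArrayOutputStream();
        final byte[] chunk = new byte[8192];
        long total = 0;
        int n;
        while ((n = in.read(chunk)) > 0) {
            total += n;
            // abort before an oversized (e.g. zip-bomb) entry exhausts the heap
            if (total > MAX_BYTES) throw new IOException("memory limit of " + MAX_BYTES + " bytes exceeded");
            buffer.write(chunk, 0, n);
        }
        return buffer.toByteArray();
    }
}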
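The throttling itself is implemented in SearchEventCache.getEvent (last hunks below). Condensed from that diff, using the original field and helper names, the loop runs in three phases; this fragment is not standalone, it refers to the members visible in the diff:

// condensed from SearchEventCache.getEvent, see the diff below
final int allowedThreads = (int) Math.max(1, MemoryControl.available()
        / (query.snippetCacheStrategy == null ? 10 : 100) / 1024 / 1024);
int waitcount = 0;
throttling: while (true) {
    // phase 1: remove expired events to make room (search events hold a lot of RAM)
    if (SearchEventCache.lastEvents.size() > allowedThreads) cleanupEvents(false);
    else break throttling;
    // phase 2: if there are still too many, delete all events that have no running workers
    if (SearchEventCache.lastEvents.size() > allowedThreads) cleanupEvents(true);
    else break throttling;
    // phase 3: wait in 100 ms steps for running searches to terminate
    if (countAliveThreads() < allowedThreads) break throttling;
    try { Thread.sleep(100); } catch (final InterruptedException e) { }
    waitcount++;
    // after ~1 second of waiting, give up and answer with an empty dummy event
    if (waitcount >= 10) return getDummyEvent(workTables, loader, query.getSegment());
}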
git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7766 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 14 years ago
parent 900dacbf97
commit 0c1b29f3c9

@@ -40,10 +40,14 @@ import net.yacy.repository.BlacklistFile;
import de.anomic.search.SearchEventCache;
import de.anomic.search.Switchboard;
import java.util.List;
import java.util.regex.Pattern;
// The Naming of the functions is a bit strange...
public class ListManager {
private final static Pattern commaPattern = Pattern.compile(",");
public static Switchboard switchboard = null;
public static File listsPath = null;
@@ -143,7 +147,7 @@ public class ListManager {
ArrayList<String> list;
if (string != null && string.length() > 0) {
list = new ArrayList<String>(Arrays.asList(string.split(",")));
list = new ArrayList<String>(Arrays.asList(commaPattern.split(string, 0)));
} else {
list = new ArrayList<String>();
}
@@ -161,7 +165,7 @@ public class ListManager {
HashSet<String> set;
if (string != null) {
set = new HashSet<String>(Arrays.asList(string.split(",")));
set = new HashSet<String>(Arrays.asList(commaPattern.split(string, 0)));
} else {
set = new HashSet<String>();
}
@@ -180,7 +184,7 @@ public class ListManager {
Vector<String> v;
if (string != null) {
v = new Vector<String>(Arrays.asList(string.split(",")));
v = new Vector<String>(Arrays.asList(commaPattern.split(string, 0)));
} else {
v = new Vector<String>();
}

@@ -10,7 +10,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@@ -31,7 +31,7 @@ import java.io.IOException;
import java.io.OutputStream;
import java.util.HashMap;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.util.ByteBuffer;
@@ -42,12 +42,12 @@ public class ServerSideIncludes {
public static void writeSSI(final ByteBuffer in, final OutputStream out, final String authorization, final String requesthost) throws IOException {
writeSSI(in, 0, out, authorization, requesthost);
}
public static void writeSSI(final ByteBuffer in, int off, final OutputStream out, final String authorization, final String requesthost) throws IOException {
int p = in.indexOf(UTF8.getBytes("<!--#"), off);
int p = in.indexOf(ASCII.getBytes("<!--#"), off);
int q;
while (p >= 0) {
q = in.indexOf(UTF8.getBytes("-->"), p + 10);
q = in.indexOf(ASCII.getBytes("-->"), p + 10);
if (out instanceof ChunkedOutputStream) {
((ChunkedOutputStream) out).write(in, off, p - off);
} else {
@@ -55,7 +55,7 @@ public class ServerSideIncludes {
}
parseSSI(in, p, out, authorization, requesthost);
off = q + 3;
p = in.indexOf(UTF8.getBytes("<!--#"), off);
p = in.indexOf(ASCII.getBytes("<!--#"), off);
}
if (out instanceof ChunkedOutputStream) {
((ChunkedOutputStream) out).write(in, off, in.length() - off);
@@ -63,17 +63,17 @@ public class ServerSideIncludes {
out.write(in.getBytes(off, in.length() - off));
}
}
private static void parseSSI(final ByteBuffer in, final int off, final OutputStream out, final String authorization, final String requesthost) {
if (in.startsWith(UTF8.getBytes("<!--#include virtual=\""), off)) {
final int q = in.indexOf(UTF8.getBytes("\""), off + 22);
if (in.startsWith(ASCII.getBytes("<!--#include virtual=\""), off)) {
final int q = in.indexOf(ASCII.getBytes("\""), off + 22);
if (q > 0) {
final String path = in.toString(off + 22, q - off - 22);
writeContent(path, out, authorization, requesthost);
}
}
}
private static void writeContent(String path, final OutputStream out, final String authorization, final String requesthost) {
// check if there are arguments in path string
String args = "";
@@ -82,7 +82,7 @@ public class ServerSideIncludes {
args = path.substring(argpos + 1);
path = path.substring(0, argpos);
}
// set up virtual connection properties to call httpdFileHander.doGet()
final HashMap<String, Object> conProp = new HashMap<String, Object>();
final RequestHeader header = new RequestHeader(HTTPDemon.reverseMappingCache);

@@ -1,4 +1,4 @@
//TemplateEngine.java
//TemplateEngine.java
//-------------------------------------
//(C) by Michael Peter Christen; mc@yacy.net
//first published on http://www.anomic.de
@@ -58,6 +58,7 @@ import java.io.PushbackInputStream;
import java.util.HashMap;
import java.util.Map;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.ByteBuffer;
@@ -143,9 +144,9 @@ public final class TemplateEngine {
public final static byte[] iOpen = {hashChar, pcChar};
public final static byte[] iClose = {pcChar, hashChar};
public final static byte[] ul = "_".getBytes();
/**
* transfer until a specified pattern is found; everything but the pattern is transferred so far
* the function returns true, if the pattern is found
@@ -172,7 +173,7 @@ public final class TemplateEngine {
}
return false;
}
private final static boolean transferUntil(final PushbackInputStream i, final OutputStream o, final byte p) throws IOException {
int b;
while ((b = i.read()) > 0) {
@@ -195,7 +196,7 @@ public final class TemplateEngine {
*/
private final static byte[] writeTemplate(final InputStream in, final OutputStream out, final Map<String, String> pattern, final byte[] dflt, final byte[] prefix) throws IOException {
final PushbackInputStream pis = new PushbackInputStream(in, 100);
ByteArrayOutputStream keyStream = new ByteArrayOutputStream(512);
final ByteArrayOutputStream keyStream = new ByteArrayOutputStream(512);
byte[] key;
byte[] multi_key;
byte[] replacement;
@@ -204,7 +205,7 @@ public final class TemplateEngine {
while (transferUntil(pis, out, hashChar)) {
bb = pis.read();
keyStream.reset();
// #{
if ((bb & 0xFF) == lcbr) { //multi
if (transferUntil(pis, keyStream, mClose)) { //close tag
@@ -250,12 +251,12 @@ public final class TemplateEngine {
Log.logSevere("TEMPLATE", "No Close Key found for #{"+UTF8.String(multi_key)+"}#"); //prefix here?
}
}
// #(
} else if ((bb & 0xFF) == lrbr) { //alternative
int others=0;
final ByteBuffer text= new ByteBuffer();
transferUntil(pis, keyStream, aClose);
key = keyStream.toByteArray(); //Caution: Key does not contain prefix
@@ -303,7 +304,7 @@ public final class TemplateEngine {
if ((bb & 0xFF) == lrbr){
transferUntil(pis, keyStream, aClose);
//reached the end. output last string.
//reached the end. output last string.
if (java.util.Arrays.equals(keyStream.toByteArray(),appendBytes(slashChar, key, null,null))) {
pis2 = new PushbackInputStream(new ByteArrayInputStream(text.getBytes()));
//this may be the wrong one, but it's the last
@@ -352,7 +353,7 @@ public final class TemplateEngine {
}
}//while
}//if(byName) (else branch)
// #[
} else if ((bb & 0xFF) == lbr) { //normal
if (transferUntil(pis, keyStream, pClose)) {
@@ -360,9 +361,11 @@ public final class TemplateEngine {
key = keyStream.toByteArray();
final String patternKey = getPatternKey(prefix, key);
replacement = replacePattern(patternKey, pattern, dflt); //replace
structure.append(UTF8.getBytes("<")).append(key).append(UTF8.getBytes(" type=\"normal\">\n"));
structure.append(ASCII.getBytes("<")).append(key)
.append(ASCII.getBytes(" type=\"normal\">\n"));
structure.append(replacement);
structure.append(UTF8.getBytes("</")).append(key).append(UTF8.getBytes(">\n"));
structure.append(ASCII.getBytes("</")).append(key)
.append(ASCII.getBytes(">\n"));
FileUtils.copy(replacement, out);
} else {
@@ -370,10 +373,10 @@ public final class TemplateEngine {
FileUtils.copy(pis, out);
return structure.getBytes();
}
// #%
} else if ((bb & 0xFF) == pcChar) { //include
final ByteBuffer include = new ByteBuffer();
final ByteBuffer include = new ByteBuffer();
keyStream.reset(); //reset stream
if(transferUntil(pis, keyStream, iClose)){
byte[] filename = keyStream.toByteArray();
@@ -395,7 +398,7 @@ public final class TemplateEngine {
include.append(UTF8.getBytes(line)).append(UTF8.getBytes(de.anomic.server.serverCore.CRLF_STRING));
}
} catch (final IOException e) {
//file not found?
//file not found?
Log.logSevere("FILEHANDLER","Include Error with file " + UTF8.String(filename) + ": " + e.getMessage());
} finally {
if (br != null) try { br.close(); br=null; } catch (final Exception e) {}
@@ -406,7 +409,7 @@ public final class TemplateEngine {
structure.append(UTF8.getBytes("</fileinclude>\n"));
}
}
// # - no special character. This is simply a '#' without meaning
} else { //no match, but a single hash (output # + bb)
out.write(hashChar);
@@ -439,10 +442,10 @@ public final class TemplateEngine {
private final static byte[] newPrefix(final byte[] oldPrefix, final byte[] key) {
final ByteBuffer newPrefix = new ByteBuffer(oldPrefix.length + key.length + 1);
newPrefix.append(oldPrefix).append(key).append(ul);
byte[] result = newPrefix.getBytes();
final byte[] result = newPrefix.getBytes();
try {
newPrefix.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
return result;
@@ -453,7 +456,7 @@ public final class TemplateEngine {
newPrefix.append(oldPrefix).append(multi_key).append(ul).append(UTF8.getBytes(Integer.toString(i))).append(ul);
try {
newPrefix.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
return newPrefix.getBytes();
@@ -467,12 +470,12 @@ public final class TemplateEngine {
} finally {
try {
patternKey.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
}
}
private final static byte[] appendBytes(final byte[] b1, final byte[] b2, final byte[] b3, final byte[] b4) {
final ByteBuffer byteArray = new ByteBuffer(b1.length + b2.length + (b3 == null ? 0 : b3.length) + (b4 == null ? 0 : b4.length));
byteArray.append(b1).append(b2);
@@ -481,7 +484,7 @@ public final class TemplateEngine {
final byte[] result = byteArray.getBytes();
try {
byteArray.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
return result;

@@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@@ -33,9 +33,7 @@ import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.Semaphore;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.storage.ReversibleScoreMap;
import net.yacy.cora.storage.ClusteredScoreMap;
import net.yacy.cora.storage.ConcurrentScoreMap;
import net.yacy.document.Condenser;
import net.yacy.document.LargeNumberCache;
import net.yacy.kelondro.data.meta.DigestURI;
@@ -51,62 +49,62 @@ import net.yacy.kelondro.util.ByteBuffer;
public class ReferenceOrder {
private static int cores = Runtime.getRuntime().availableProcessors();
private int maxdomcount;
private WordReferenceVars min, max;
private final ReversibleScoreMap<String> doms; // collected for "authority" heuristic
private final ConcurrentScoreMap<String> doms; // collected for "authority" heuristic
private final RankingProfile ranking;
private final byte[] language;
public ReferenceOrder(final RankingProfile profile, byte[] language) {
public ReferenceOrder(final RankingProfile profile, final byte[] language) {
this.min = null;
this.max = null;
this.ranking = profile;
this.doms = new ClusteredScoreMap<String>();
this.doms = new ConcurrentScoreMap<String>();
this.maxdomcount = 0;
this.language = language;
}
public BlockingQueue<WordReferenceVars> normalizeWith(final ReferenceContainer<WordReference> container) {
LinkedBlockingQueue<WordReferenceVars> out = new LinkedBlockingQueue<WordReferenceVars>();
int threads = cores + 1;
if (container.size() < 20) threads = 2;
Thread distributor = new NormalizeDistributor(container, out, threads);
final LinkedBlockingQueue<WordReferenceVars> out = new LinkedBlockingQueue<WordReferenceVars>();
int threads = cores;
if (container.size() < 100) threads = 2;
final Thread distributor = new NormalizeDistributor(container, out, threads);
distributor.start();
try {
distributor.join(10); // let the distributor work for at least 10 milliseconds
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
}
// return the resulting queue while the processing queues are still working
return out;
}
private final class NormalizeDistributor extends Thread {
ReferenceContainer<WordReference> container;
LinkedBlockingQueue<WordReferenceVars> out;
private int threads;
public NormalizeDistributor(ReferenceContainer<WordReference> container, LinkedBlockingQueue<WordReferenceVars> out, int threads) {
private final int threads;
public NormalizeDistributor(final ReferenceContainer<WordReference> container, final LinkedBlockingQueue<WordReferenceVars> out, final int threads) {
this.container = container;
this.out = out;
this.threads = threads;
}
@Override
public void run() {
// transform the reference container into a stream of parsed entries
BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(container);
final BlockingQueue<WordReferenceVars> vars = WordReferenceVars.transform(this.container);
// start the transformation threads
Semaphore termination = new Semaphore(this.threads);
NormalizeWorker[] worker = new NormalizeWorker[this.threads];
final Semaphore termination = new Semaphore(this.threads);
final NormalizeWorker[] worker = new NormalizeWorker[this.threads];
for (int i = 0; i < this.threads; i++) {
worker[i] = new NormalizeWorker(out, termination);
worker[i] = new NormalizeWorker(this.out, termination);
worker[i].start();
}
// fill the queue
WordReferenceVars iEntry;
int p = 0;
@@ -115,16 +113,16 @@ public class ReferenceOrder {
worker[p % this.threads].add(iEntry);
p++;
}
} catch (InterruptedException e) {
} catch (final InterruptedException e) {
}
// insert poison to stop the queues
for (int i = 0; i < this.threads; i++) worker[i].add(WordReferenceVars.poison);
// wait for termination but not too long to make it possible that this
// is called from outside with a join to get some normalization results
// before going on
for (int i = 0; i < this.threads; i++) try {worker[i].join(100);} catch (InterruptedException e) {}
for (int i = 0; i < this.threads; i++) try {worker[i].join(100);} catch (final InterruptedException e) {}
}
}
@@ -132,36 +130,36 @@ public class ReferenceOrder {
* normalize ranking: find minimum and maximum of separate ranking criteria
*/
private class NormalizeWorker extends Thread {
private final BlockingQueue<WordReferenceVars> out;
private final Semaphore termination;
private final BlockingQueue<WordReferenceVars> decodedEntries;
public NormalizeWorker(final BlockingQueue<WordReferenceVars> out, Semaphore termination) {
public NormalizeWorker(final BlockingQueue<WordReferenceVars> out, final Semaphore termination) {
this.out = out;
this.termination = termination;
this.decodedEntries = new LinkedBlockingQueue<WordReferenceVars>();
}
public void add(WordReferenceVars entry) {
public void add(final WordReferenceVars entry) {
try {
decodedEntries.put(entry);
} catch (InterruptedException e) {
this.decodedEntries.put(entry);
} catch (final InterruptedException e) {
}
}
public void run() {
try {
WordReferenceVars iEntry;
Map<String, Integer> doms0 = new HashMap<String, Integer>();
final Map<String, Integer> doms0 = new HashMap<String, Integer>();
String dom;
Integer count;
final Integer int1 = 1;
while ((iEntry = decodedEntries.take()) != WordReferenceVars.poison) {
while ((iEntry = this.decodedEntries.take()) != WordReferenceVars.poison) {
// find min/max
if (min == null) min = iEntry.clone(); else min.min(iEntry);
if (max == null) max = iEntry.clone(); else max.max(iEntry);
out.put(iEntry); // must be after the min/max check to prevent that min/max is null in cardinal()
if (ReferenceOrder.this.min == null) ReferenceOrder.this.min = iEntry.clone(); else ReferenceOrder.this.min.min(iEntry);
if (ReferenceOrder.this.max == null) ReferenceOrder.this.max = iEntry.clone(); else ReferenceOrder.this.max.max(iEntry);
this.out.put(iEntry); // must be after the min/max check to prevent that min/max is null in cardinal()
// update domcount
dom = iEntry.hosthash();
count = doms0.get(dom);
@@ -177,26 +175,26 @@ public class ReferenceOrder {
final Iterator<Map.Entry<String, Integer>> di = doms0.entrySet().iterator();
while (di.hasNext()) {
entry = di.next();
doms.inc(entry.getKey(), (entry.getValue()).intValue());
ReferenceOrder.this.doms.inc(entry.getKey(), (entry.getValue()).intValue());
}
if (!doms.isEmpty()) maxdomcount = doms.getMaxScore();
} catch (InterruptedException e) {
if (!ReferenceOrder.this.doms.isEmpty()) ReferenceOrder.this.maxdomcount = ReferenceOrder.this.doms.getMaxScore();
} catch (final InterruptedException e) {
Log.logException(e);
} catch (Exception e) {
} catch (final Exception e) {
Log.logException(e);
} finally {
// insert poison to signal the termination to next queue
try {
this.termination.acquire();
if (this.termination.availablePermits() == 0) this.out.put(WordReferenceVars.poison);
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
}
}
public int authority(final String hostHash) {
assert hostHash.length() == 6;
return (doms.get(hostHash) << 8) / (1 + this.maxdomcount);
return (this.doms.get(hostHash) << 8) / (1 + this.maxdomcount);
}
/**
@@ -208,45 +206,45 @@ public class ReferenceOrder {
//return Long.MAX_VALUE - preRanking(ranking, iEntry, this.entryMin, this.entryMax, this.searchWords);
// the normalizedEntry must be a normalized indexEntry
final Bitfield flags = t.flags();
assert min != null;
assert max != null;
assert this.min != null;
assert this.max != null;
assert t != null;
assert ranking != null;
final long tf = ((max.termFrequency() == min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-min.termFrequency())*256.0)/(max.termFrequency() - min.termFrequency())))) << ranking.coeff_termfrequency);
assert this.ranking != null;
final long tf = ((this.max.termFrequency() == this.min.termFrequency()) ? 0 : (((int)(((t.termFrequency()-this.min.termFrequency())*256.0)/(this.max.termFrequency() - this.min.termFrequency())))) << this.ranking.coeff_termfrequency);
//System.out.println("tf(" + t.urlHash + ") = " + Math.floor(1000 * t.termFrequency()) + ", min = " + Math.floor(1000 * min.termFrequency()) + ", max = " + Math.floor(1000 * max.termFrequency()) + ", tf-normed = " + tf);
int maxmaxpos = max.maxposition();
int minminpos = min.minposition();
final int maxmaxpos = this.max.maxposition();
final int minminpos = this.min.minposition();
final long r =
((256 - DigestURI.domLengthNormalized(t.urlhash())) << ranking.coeff_domlength)
+ ((ranking.coeff_ybr > 12) ? ((256 - (BlockRank.ranking(t.urlhash()) << 4)) << ranking.coeff_ybr) : 0)
+ ((max.urlcomps() == min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - min.urlcomps() ) << 8) / (max.urlcomps() - min.urlcomps()) )) << ranking.coeff_urlcomps)
+ ((max.urllength() == min.urllength() ) ? 0 : (256 - (((t.urllength() - min.urllength() ) << 8) / (max.urllength() - min.urllength()) )) << ranking.coeff_urllength)
+ ((maxmaxpos == minminpos ) ? 0 : (256 - (((t.minposition() - minminpos ) << 8) / (maxmaxpos - minminpos) )) << ranking.coeff_posintext)
+ ((max.posofphrase() == min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - min.posofphrase() ) << 8) / (max.posofphrase() - min.posofphrase()) )) << ranking.coeff_posofphrase)
+ ((max.posinphrase() == min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - min.posinphrase() ) << 8) / (max.posinphrase() - min.posinphrase()) )) << ranking.coeff_posinphrase)
+ ((max.distance() == min.distance() ) ? 0 : (256 - (((t.distance() - min.distance() ) << 8) / (max.distance() - min.distance()) )) << ranking.coeff_worddistance)
+ ((max.virtualAge() == min.virtualAge()) ? 0 : (((t.virtualAge() - min.virtualAge() ) << 8) / (max.virtualAge() - min.virtualAge()) ) << ranking.coeff_date)
+ ((max.wordsintitle() == min.wordsintitle()) ? 0 : (((t.wordsintitle() - min.wordsintitle() ) << 8) / (max.wordsintitle() - min.wordsintitle()) ) << ranking.coeff_wordsintitle)
+ ((max.wordsintext() == min.wordsintext()) ? 0 : (((t.wordsintext() - min.wordsintext() ) << 8) / (max.wordsintext() - min.wordsintext()) ) << ranking.coeff_wordsintext)
+ ((max.phrasesintext() == min.phrasesintext()) ? 0 : (((t.phrasesintext()- min.phrasesintext() ) << 8) / (max.phrasesintext()- min.phrasesintext()) ) << ranking.coeff_phrasesintext)
+ ((max.llocal() == min.llocal()) ? 0 : (((t.llocal() - min.llocal() ) << 8) / (max.llocal() - min.llocal()) ) << ranking.coeff_llocal)
+ ((max.lother() == min.lother()) ? 0 : (((t.lother() - min.lother() ) << 8) / (max.lother() - min.lother()) ) << ranking.coeff_lother)
+ ((max.hitcount() == min.hitcount()) ? 0 : (((t.hitcount() - min.hitcount() ) << 8) / (max.hitcount() - min.hitcount()) ) << ranking.coeff_hitcount)
((256 - DigestURI.domLengthNormalized(t.urlhash())) << this.ranking.coeff_domlength)
+ ((this.ranking.coeff_ybr > 12) ? ((256 - (BlockRank.ranking(t.urlhash()) << 4)) << this.ranking.coeff_ybr) : 0)
+ ((this.max.urlcomps() == this.min.urlcomps() ) ? 0 : (256 - (((t.urlcomps() - this.min.urlcomps() ) << 8) / (this.max.urlcomps() - this.min.urlcomps()) )) << this.ranking.coeff_urlcomps)
+ ((this.max.urllength() == this.min.urllength() ) ? 0 : (256 - (((t.urllength() - this.min.urllength() ) << 8) / (this.max.urllength() - this.min.urllength()) )) << this.ranking.coeff_urllength)
+ ((maxmaxpos == minminpos) ? 0 : (256 - (((t.minposition() - minminpos) << 8) / (maxmaxpos - minminpos))) << this.ranking.coeff_posintext)
+ ((this.max.posofphrase() == this.min.posofphrase()) ? 0 : (256 - (((t.posofphrase() - this.min.posofphrase() ) << 8) / (this.max.posofphrase() - this.min.posofphrase()) )) << this.ranking.coeff_posofphrase)
+ ((this.max.posinphrase() == this.min.posinphrase()) ? 0 : (256 - (((t.posinphrase() - this.min.posinphrase() ) << 8) / (this.max.posinphrase() - this.min.posinphrase()) )) << this.ranking.coeff_posinphrase)
+ ((this.max.distance() == this.min.distance() ) ? 0 : (256 - (((t.distance() - this.min.distance() ) << 8) / (this.max.distance() - this.min.distance()) )) << this.ranking.coeff_worddistance)
+ ((this.max.virtualAge() == this.min.virtualAge()) ? 0 : (((t.virtualAge() - this.min.virtualAge() ) << 8) / (this.max.virtualAge() - this.min.virtualAge()) ) << this.ranking.coeff_date)
+ ((this.max.wordsintitle() == this.min.wordsintitle()) ? 0 : (((t.wordsintitle() - this.min.wordsintitle() ) << 8) / (this.max.wordsintitle() - this.min.wordsintitle()) ) << this.ranking.coeff_wordsintitle)
+ ((this.max.wordsintext() == this.min.wordsintext()) ? 0 : (((t.wordsintext() - this.min.wordsintext() ) << 8) / (this.max.wordsintext() - this.min.wordsintext()) ) << this.ranking.coeff_wordsintext)
+ ((this.max.phrasesintext() == this.min.phrasesintext()) ? 0 : (((t.phrasesintext()- this.min.phrasesintext() ) << 8) / (this.max.phrasesintext()- this.min.phrasesintext()) ) << this.ranking.coeff_phrasesintext)
+ ((this.max.llocal() == this.min.llocal()) ? 0 : (((t.llocal() - this.min.llocal() ) << 8) / (this.max.llocal() - this.min.llocal()) ) << this.ranking.coeff_llocal)
+ ((this.max.lother() == this.min.lother()) ? 0 : (((t.lother() - this.min.lother() ) << 8) / (this.max.lother() - this.min.lother()) ) << this.ranking.coeff_lother)
+ ((this.max.hitcount() == this.min.hitcount()) ? 0 : (((t.hitcount() - this.min.hitcount() ) << 8) / (this.max.hitcount() - this.min.hitcount()) ) << this.ranking.coeff_hitcount)
+ tf
+ ((ranking.coeff_authority > 12) ? (authority(t.hosthash()) << ranking.coeff_authority) : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << ranking.coeff_appurl : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << ranking.coeff_app_dc_title : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << ranking.coeff_app_dc_creator : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << ranking.coeff_app_dc_subject : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << ranking.coeff_app_dc_description : 0)
+ ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << ranking.coeff_appemph : 0)
+ ((flags.get(Condenser.flag_cat_indexof)) ? 255 << ranking.coeff_catindexof : 0)
+ ((flags.get(Condenser.flag_cat_hasimage)) ? 255 << ranking.coeff_cathasimage : 0)
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << ranking.coeff_cathasaudio : 0)
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << ranking.coeff_cathasvideo : 0)
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << ranking.coeff_cathasapp : 0)
+ ((ByteBuffer.equals(t.language, this.language)) ? 255 << ranking.coeff_language : 0)
+ ((DigestURI.probablyRootURL(t.urlhash())) ? 15 << ranking.coeff_urllength : 0);
+ ((this.ranking.coeff_authority > 12) ? (authority(t.hosthash()) << this.ranking.coeff_authority) : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_identifier)) ? 255 << this.ranking.coeff_appurl : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_title)) ? 255 << this.ranking.coeff_app_dc_title : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_creator)) ? 255 << this.ranking.coeff_app_dc_creator : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_subject)) ? 255 << this.ranking.coeff_app_dc_subject : 0)
+ ((flags.get(WordReferenceRow.flag_app_dc_description)) ? 255 << this.ranking.coeff_app_dc_description : 0)
+ ((flags.get(WordReferenceRow.flag_app_emphasized)) ? 255 << this.ranking.coeff_appemph : 0)
+ ((flags.get(Condenser.flag_cat_indexof)) ? 255 << this.ranking.coeff_catindexof : 0)
+ ((flags.get(Condenser.flag_cat_hasimage)) ? 255 << this.ranking.coeff_cathasimage : 0)
+ ((flags.get(Condenser.flag_cat_hasaudio)) ? 255 << this.ranking.coeff_cathasaudio : 0)
+ ((flags.get(Condenser.flag_cat_hasvideo)) ? 255 << this.ranking.coeff_cathasvideo : 0)
+ ((flags.get(Condenser.flag_cat_hasapp)) ? 255 << this.ranking.coeff_cathasapp : 0)
+ ((ByteBuffer.equals(t.language, this.language)) ? 255 << this.ranking.coeff_language : 0)
+ ((DigestURI.probablyRootURL(t.urlhash())) ? 15 << this.ranking.coeff_urllength : 0);
//if (searchWords != null) r += (yacyURL.probablyWordURL(t.urlHash(), searchWords) != null) ? 256 << ranking.coeff_appurl : 0;

@@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@@ -31,7 +31,6 @@ import java.util.Iterator;
import java.util.List;
import java.util.regex.Pattern;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.storage.ScoreMap;
@@ -44,8 +43,8 @@ import net.yacy.kelondro.index.HandleSet;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.EventTracker;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.crawler.CrawlProfile;
import de.anomic.data.WorkTables;
import de.anomic.http.client.Cache;
@@ -53,13 +52,13 @@ import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.graphics.ProfilingGraph;
public class ResultFetcher {
// input values
final RankingProcess rankingProcess; // ordered search results, grows dynamically as all the query threads enrich this container
QueryParams query;
private final yacySeedDB peers;
private final WorkTables workTables;
// result values
protected final LoaderDispatcher loader;
protected Worker[] workerThreads;
@@ -70,10 +69,10 @@ public class ResultFetcher {
long snippetComputationAllTime;
int taketimeout;
private final boolean deleteIfSnippetFail;
public ResultFetcher(
final LoaderDispatcher loader,
RankingProcess rankedCache,
final RankingProcess rankedCache,
final QueryParams query,
final yacySeedDB peers,
final WorkTables workTables,
@@ -87,18 +86,18 @@ public class ResultFetcher {
this.workTables = workTables;
this.taketimeout = taketimeout;
this.deleteIfSnippetFail = deleteIfSnippetFail;
this.urlRetrievalAllTime = 0;
this.snippetComputationAllTime = 0;
this.result = new WeakPriorityBlockingQueue<ResultEntry>(-1); // this is the result, enriched with snippets, ranked and ordered by ranking
this.images = new WeakPriorityBlockingQueue<MediaSnippet>(-1);
// snippets do not need to match with the complete query hashes,
// only with the query minus the stopwords which had not been used for the search
HandleSet filtered;
try {
filtered = HandleSet.joinConstructive(query.queryHashes, Switchboard.stopwordHashes);
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
Log.logException(e);
filtered = new HandleSet(query.queryHashes.row().primaryKeyLength, query.queryHashes.comparator(), 0);
}
@@ -106,81 +105,82 @@ public class ResultFetcher {
if (filtered != null && !filtered.isEmpty()) {
this.snippetFetchWordHashes.excludeDestructive(Switchboard.stopwordHashes);
}
// start worker threads to fetch urls and snippets
this.workerThreads = null;
deployWorker(Math.min(10, query.itemsPerPage), query.neededResults());
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.SNIPPETFETCH_START, ((this.workerThreads == null) ? "no" : this.workerThreads.length) + " online snippet fetch threads started", 0, 0), false);
}
public long getURLRetrievalTime() {
return this.urlRetrievalAllTime;
}
public long getSnippetComputationTime() {
return this.snippetComputationAllTime;
}
public ResultEntry oneResult(final int item, long timeout) {
public ResultEntry oneResult(final int item, final long timeout) {
// check if we already retrieved this item
// (happens if a search page is accessed a second time)
long finishTime = System.currentTimeMillis() + timeout;
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.ONERESULT, "started, item = " + item + ", available = " + this.result.sizeAvailable(), 0, 0), false);
final long finishTime = System.currentTimeMillis() + timeout;
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.ONERESULT, "started, item = " + item + ", available = " + this.result.sizeAvailable(), 0, 0), false);
if (this.result.sizeAvailable() > item) {
// we have the wanted result already in the result array .. return that
ResultEntry re = this.result.element(item).getElement();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.ONERESULT, "prefetched, item = " + item + ", available = " + this.result.sizeAvailable() + ": " + re.urlstring(), 0, 0), false);
final ResultEntry re = this.result.element(item).getElement();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.ONERESULT, "prefetched, item = " + item + ", available = " + this.result.sizeAvailable() + ": " + re.urlstring(), 0, 0), false);
return re;
}
// deploy worker to get more results
deployWorker(Math.min(20, query.itemsPerPage), item + query.itemsPerPage);
// deploy worker to get more results
final int neededInclPrefetch = this.query.neededResults() + ((MemoryControl.available() > 100 * 1024 * 1024) ? this.query.itemsPerPage : 0);
deployWorker(Math.min(20, this.query.itemsPerPage), neededInclPrefetch);
// finally wait until enough results are there produced from the snippet fetch process
WeakPriorityBlockingQueue.Element<ResultEntry> entry = null;
while (System.currentTimeMillis() < finishTime) {
if (this.result.sizeAvailable() + this.rankingProcess.sizeQueue() <= item && !anyWorkerAlive() && this.rankingProcess.feedingIsFinished()) break;
try {entry = this.result.element(item, 50);} catch (InterruptedException e) {Log.logException(e);}
try {entry = this.result.element(item, 50);} catch (final InterruptedException e) {Log.logException(e);}
if (entry != null) break;
if (!anyWorkerAlive() && this.rankingProcess.sizeQueue() == 0 && this.rankingProcess.feedingIsFinished()) break;
}
// finally, if there is something, return the result
if (entry == null) {
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.ONERESULT, "not found, item = " + item + ", available = " + this.result.sizeAvailable(), 0, 0), false);
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.ONERESULT, "not found, item = " + item + ", available = " + this.result.sizeAvailable(), 0, 0), false);
return null;
}
ResultEntry re = entry.getElement();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(query.id(true), SearchEvent.Type.ONERESULT, "retrieved, item = " + item + ", available = " + this.result.sizeAvailable() + ": " + re.urlstring(), 0, 0), false);
final ResultEntry re = entry.getElement();
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.searchEvent(this.query.id(true), SearchEvent.Type.ONERESULT, "retrieved, item = " + item + ", available = " + this.result.sizeAvailable() + ": " + re.urlstring(), 0, 0), false);
return re;
}
private int resultCounter = 0;
public ResultEntry nextResult() {
final ResultEntry re = oneResult(resultCounter, 1000);
resultCounter++;
final ResultEntry re = oneResult(this.resultCounter, 1000);
this.resultCounter++;
return re;
}
public MediaSnippet oneImage(final int item) {
// always look for a next object if there are way too few
if (this.images.sizeAvailable() <= item + 10) fillImagesCache();
// check if we already retrieved the item
if (this.images.sizeDrained() > item) return this.images.element(item).getElement();
// look again if there are not enough for presentation
while (this.images.sizeAvailable() <= item) {
if (fillImagesCache() == 0) break;
}
}
if (this.images.sizeAvailable() <= item) return null;
// now take the specific item from the image stack
return this.images.element(item).getElement();
}
private int fillImagesCache() {
ResultEntry result = nextResult();
final ResultEntry result = nextResult();
int c = 0;
if (result == null) return c;
// iterate over all images in the result
@@ -188,28 +188,28 @@ public class ResultFetcher {
if (imagemedia != null) {
feedloop: for (final MediaSnippet ms: imagemedia) {
// check cache to see if the mime type of the image url is correct
ResponseHeader header = Cache.getResponseHeader(ms.href.hash());
final ResponseHeader header = Cache.getResponseHeader(ms.href.hash());
if (header != null) {
// this does not work for all urls since some of them may not be in the cache
if (header.mime().startsWith("text") || header.mime().startsWith("application")) continue feedloop;
}
images.put(new ReverseElement<MediaSnippet>(ms, ms.ranking)); // remove smallest in case of overflow
this.images.put(new ReverseElement<MediaSnippet>(ms, ms.ranking)); // remove smallest in case of overflow
c++;
//System.out.println("*** image " + UTF8.String(ms.href.hash()) + " images.size = " + images.size() + "/" + images.size());
}
}
return c;
}
public ArrayList<WeakPriorityBlockingQueue.Element<ResultEntry>> completeResults(final long waitingtime) {
final long timeout = System.currentTimeMillis() + waitingtime;
while ( result.sizeAvailable() < query.neededResults() &&
while ( this.result.sizeAvailable() < this.query.neededResults() &&
anyWorkerAlive() &&
System.currentTimeMillis() < timeout) {
try {Thread.sleep(20);} catch (final InterruptedException e) {}
//System.out.println("+++DEBUG-completeResults+++ sleeping " + 200);
}
return this.result.list(Math.min(query.neededResults(), this.result.sizeAvailable()));
return this.result.list(Math.min(this.query.neededResults(), this.result.sizeAvailable()));
}
public long postRanking(
@@ -217,55 +217,59 @@ public class ResultFetcher {
final ScoreMap<String> topwords) {
long r = 0;
// for media search: prefer pages with many links
if (query.contentdom == ContentDomain.IMAGE) r += rentry.limage() << query.ranking.coeff_cathasimage;
if (query.contentdom == ContentDomain.AUDIO) r += rentry.laudio() << query.ranking.coeff_cathasaudio;
if (query.contentdom == ContentDomain.VIDEO) r += rentry.lvideo() << query.ranking.coeff_cathasvideo;
if (query.contentdom == ContentDomain.APP ) r += rentry.lapp() << query.ranking.coeff_cathasapp;
if (this.query.contentdom == ContentDomain.IMAGE) r += rentry.limage() << this.query.ranking.coeff_cathasimage;
if (this.query.contentdom == ContentDomain.AUDIO) r += rentry.laudio() << this.query.ranking.coeff_cathasaudio;
if (this.query.contentdom == ContentDomain.VIDEO) r += rentry.lvideo() << this.query.ranking.coeff_cathasvideo;
if (this.query.contentdom == ContentDomain.APP ) r += rentry.lapp() << this.query.ranking.coeff_cathasapp;
// prefer hit with 'prefer' pattern
if (query.prefer.matcher(rentry.url().toNormalform(true, true)).matches()) r += 256 << query.ranking.coeff_prefer;
if (query.prefer.matcher(rentry.title()).matches()) r += 256 << query.ranking.coeff_prefer;
if (this.query.prefer.matcher(rentry.url().toNormalform(true, true)).matches()) r += 256 << this.query.ranking.coeff_prefer;
if (this.query.prefer.matcher(rentry.title()).matches()) r += 256 << this.query.ranking.coeff_prefer;
// apply 'common-sense' heuristic using references
final String urlstring = rentry.url().toNormalform(true, true);
final String[] urlcomps = MultiProtocolURI.urlComps(urlstring);
final String[] descrcomps = MultiProtocolURI.splitpattern.split(rentry.title().toLowerCase());
int tc;
for (int j = 0; j < urlcomps.length; j++) {
tc = topwords.get(urlcomps[j]);
if (tc > 0) r += Math.max(1, tc) << query.ranking.coeff_urlcompintoplist;
for (final String urlcomp : urlcomps) {
tc = topwords.get(urlcomp);
if (tc > 0) r += Math.max(1, tc) << this.query.ranking.coeff_urlcompintoplist;
}
for (int j = 0; j < descrcomps.length; j++) {
tc = topwords.get(descrcomps[j]);
if (tc > 0) r += Math.max(1, tc) << query.ranking.coeff_descrcompintoplist;
for (final String descrcomp : descrcomps) {
tc = topwords.get(descrcomp);
if (tc > 0) r += Math.max(1, tc) << this.query.ranking.coeff_descrcompintoplist;
}
// apply query-in-result matching
final HandleSet urlcomph = Word.words2hashesHandles(urlcomps);
final HandleSet descrcomph = Word.words2hashesHandles(descrcomps);
final Iterator<byte[]> shi = query.queryHashes.iterator();
final Iterator<byte[]> shi = this.query.queryHashes.iterator();
byte[] queryhash;
while (shi.hasNext()) {
queryhash = shi.next();
if (urlcomph.has(queryhash)) r += 256 << query.ranking.coeff_appurl;
if (descrcomph.has(queryhash)) r += 256 << query.ranking.coeff_app_dc_title;
if (urlcomph.has(queryhash)) r += 256 << this.query.ranking.coeff_appurl;
if (descrcomph.has(queryhash)) r += 256 << this.query.ranking.coeff_app_dc_title;
}
return r;
}
public void deployWorker(int deployCount, final int neededResults) {
if (rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0) return;
if (this.rankingProcess.feedingIsFinished() && this.rankingProcess.sizeQueue() == 0) return;
if (this.result.sizeAvailable() >= neededResults) return;
if (this.workerThreads == null) {
this.workerThreads = new Worker[deployCount];
synchronized(this.workerThreads) {
for (int i = 0; i < workerThreads.length; i++) {
Worker worker = new Worker(i, 10000, query.snippetCacheStrategy, query.snippetMatcher, neededResults);
for (int i = 0; i < this.workerThreads.length; i++) {
final Worker worker = new Worker(i, 10000, this.query.snippetCacheStrategy, this.query.snippetMatcher, neededResults);
worker.start();
this.workerThreads[i] = worker;
if (this.rankingProcess.feedingIsFinished() && this.rankingProcess.sizeQueue() == 0) break;
if (this.result.sizeAvailable() >= neededResults) break;
}
}
} else {
@@ -275,38 +279,40 @@ public class ResultFetcher {
for (int i = 0; i < this.workerThreads.length; i++) {
if (deployCount <= 0) break;
if (this.workerThreads[i] == null || !this.workerThreads[i].isAlive()) {
Worker worker = new Worker(i, 10000, query.snippetCacheStrategy, query.snippetMatcher, neededResults);
final Worker worker = new Worker(i, 10000, this.query.snippetCacheStrategy, this.query.snippetMatcher, neededResults);
worker.start();
this.workerThreads[i] = worker;
deployCount--;
}
if (this.rankingProcess.feedingIsFinished() && this.rankingProcess.sizeQueue() == 0) break;
if (this.result.sizeAvailable() >= neededResults) break;
}
}
}
}
private boolean anyWorkerAlive() {
if (this.workerThreads == null) return false;
synchronized(this.workerThreads) {
for (int i = 0; i < this.workerThreads.length; i++) {
if ((this.workerThreads[i] != null) &&
(this.workerThreads[i].isAlive()) &&
(this.workerThreads[i].busytime() < 1000)) return true;
for (final Worker workerThread : this.workerThreads) {
if ((workerThread != null) &&
(workerThread.isAlive()) &&
(workerThread.busytime() < 1000)) return true;
}
}
return false;
}
protected class Worker extends Thread {
private final long timeout; // the date until this thread should try to work
private long lastLifeSign; // when the last time the run()-loop was executed
private final int id;
private final CrawlProfile.CacheStrategy cacheStrategy;
private final int neededResults;
private final Pattern snippetPattern;
public Worker(final int id, final long maxlifetime, CrawlProfile.CacheStrategy cacheStrategy, Pattern snippetPattern, int neededResults) {
public Worker(final int id, final long maxlifetime, final CrawlProfile.CacheStrategy cacheStrategy, final Pattern snippetPattern, final int neededResults) {
this.id = id;
this.cacheStrategy = cacheStrategy;
this.lastLifeSign = System.currentTimeMillis();
@@ -321,7 +327,7 @@ public class ResultFetcher {
// start fetching urls and snippets
URIMetadataRow page;
//final int fetchAhead = snippetMode == 0 ? 0 : 10;
boolean nav_topics = query.navigators.equals("all") || query.navigators.indexOf("topics") >= 0;
final boolean nav_topics = ResultFetcher.this.query.navigators.equals("all") || ResultFetcher.this.query.navigators.indexOf("topics") >= 0;
try {
//System.out.println("DEPLOYED WORKER " + id + " FOR " + this.neededResults + " RESULTS, timeoutd = " + (this.timeout - System.currentTimeMillis()));
int loops = 0;
@@ -329,52 +335,52 @@ public class ResultFetcher {
this.lastLifeSign = System.currentTimeMillis();
// check if we have enough
if (result.sizeAvailable() >= this.neededResults) {
//Log.logWarning("ResultFetcher", result.sizeAvailable() + " = result.sizeAvailable() >= this.neededResults = " + this.neededResults);
if (ResultFetcher.this.result.sizeAvailable() >= this.neededResults) {
//Log.logWarning("ResultFetcher", ResultFetcher.this.result.sizeAvailable() + " = result.sizeAvailable() >= this.neededResults = " + this.neededResults);
break;
}
// check if we can succeed if we try to take another url
if (rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0) {
if (ResultFetcher.this.rankingProcess.feedingIsFinished() && ResultFetcher.this.rankingProcess.sizeQueue() == 0) {
//Log.logWarning("ResultFetcher", "rankingProcess.feedingIsFinished() && rankingProcess.sizeQueue() == 0");
break;
}
// get next entry
page = rankingProcess.takeURL(true, this.timeout - System.currentTimeMillis());
page = ResultFetcher.this.rankingProcess.takeURL(true, Math.min(100, this.timeout - System.currentTimeMillis()));
//if (page == null) page = rankedCache.takeURL(false, this.timeout - System.currentTimeMillis());
if (page == null) {
//System.out.println("page == null");
break; // no more available
}
if (query.filterfailurls && workTables.failURLsContains(page.hash())) continue;
if (ResultFetcher.this.query.filterfailurls && ResultFetcher.this.workTables.failURLsContains(page.hash())) continue;
loops++;
final ResultEntry resultEntry = fetchSnippet(page, cacheStrategy); // does not fetch snippets if snippetMode == 0
final ResultEntry resultEntry = fetchSnippet(page, this.cacheStrategy); // does not fetch snippets if snippetMode == 0
if (resultEntry == null) continue; // the entry had some problems, cannot be used
String rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw();
final String rawLine = resultEntry.textSnippet() == null ? null : resultEntry.textSnippet().getLineRaw();
//System.out.println("***SNIPPET*** raw='" + rawLine + "', pattern='" + this.snippetPattern.toString() + "'");
if (rawLine != null && !this.snippetPattern.matcher(rawLine).matches()) continue;
//if (result.contains(resultEntry)) continue;
urlRetrievalAllTime += resultEntry.dbRetrievalTime;
snippetComputationAllTime += resultEntry.snippetComputationTime;
ResultFetcher.this.urlRetrievalAllTime += resultEntry.dbRetrievalTime;
ResultFetcher.this.snippetComputationAllTime += resultEntry.snippetComputationTime;
// place the result to the result vector
// apply post-ranking
long ranking = Long.valueOf(rankingProcess.getOrder().cardinal(resultEntry.word()));
ranking += postRanking(resultEntry, rankingProcess.getTopicNavigator(10));
long ranking = Long.valueOf(ResultFetcher.this.rankingProcess.getOrder().cardinal(resultEntry.word()));
ranking += postRanking(resultEntry, ResultFetcher.this.rankingProcess.getTopicNavigator(10));
resultEntry.ranking = ranking;
result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
if (nav_topics) rankingProcess.addTopics(resultEntry);
ResultFetcher.this.result.put(new ReverseElement<ResultEntry>(resultEntry, ranking)); // remove smallest in case of overflow
if (nav_topics) ResultFetcher.this.rankingProcess.addTopics(resultEntry);
}
//System.out.println("FINISHED WORKER " + id + " FOR " + this.neededResults + " RESULTS, loops = " + loops);
} catch (final Exception e) {
Log.logException(e);
}
Log.logInfo("SEARCH", "resultWorker thread " + id + " terminated");
Log.logInfo("SEARCH", "resultWorker thread " + this.id + " terminated");
}
/**
* calculate the time since the worker has had the latest activity
* @return time in milliseconds lasted since latest activity
@@ -383,13 +389,13 @@ public class ResultFetcher {
return System.currentTimeMillis() - this.lastLifeSign;
}
}
protected ResultEntry fetchSnippet(final URIMetadataRow page, CrawlProfile.CacheStrategy cacheStrategy) {
protected ResultEntry fetchSnippet(final URIMetadataRow page, final CrawlProfile.CacheStrategy cacheStrategy) {
// Snippet Fetching can have 3 modes:
// 0 - do not fetch snippets
// 1 - fetch snippets offline only
// 2 - online snippet fetch
// load only urls if there was not yet a root url of that hash
// find the url entry
@@ -397,66 +403,66 @@ public class ResultFetcher {
final URIMetadataRow.Components metadata = page.metadata();
if (metadata == null) return null;
final long dbRetrievalTime = System.currentTimeMillis() - startTime;
if (cacheStrategy == null) {
final TextSnippet snippet = new TextSnippet(
null,
metadata,
snippetFetchWordHashes,
this.snippetFetchWordHashes,
null,
((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))),
((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_indexof))),
220,
Integer.MAX_VALUE,
!query.isLocal());
return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, 0); // result without snippet
!this.query.isLocal());
return new ResultEntry(page, this.query.getSegment(), this.peers, snippet, null, dbRetrievalTime, 0); // result without snippet
}
// load snippet
if (query.contentdom == ContentDomain.TEXT) {
if (this.query.contentdom == ContentDomain.TEXT) {
// attach text snippet
startTime = System.currentTimeMillis();
final TextSnippet snippet = new TextSnippet(
this.loader,
metadata,
snippetFetchWordHashes,
this.snippetFetchWordHashes,
cacheStrategy,
((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))),
((this.query.constraint != null) && (this.query.constraint.get(Condenser.flag_cat_indexof))),
180,
Integer.MAX_VALUE,
!query.isLocal());
!this.query.isLocal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH", "text snippet load time for " + metadata.url() + ": " + snippetComputationTime + ", " + (!snippet.getErrorCode().fail() ? "snippet found" : ("no snippet found (" + snippet.getError() + ")")));
if (!snippet.getErrorCode().fail()) {
// we loaded the file and found the snippet
return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
return new ResultEntry(page, this.query.getSegment(), this.peers, snippet, null, dbRetrievalTime, snippetComputationTime); // result with snippet attached
} else if (cacheStrategy.mustBeOffline()) {
// we did not demand online loading, therefore a failure does not mean that the missing snippet causes a rejection of this result
// this may happen during a remote search, because snippet loading is omitted to retrieve results faster
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, dbRetrievalTime, snippetComputationTime); // result without snippet
} else {
// problems with snippet fetch
String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
if (deleteIfSnippetFail) this.workTables.failURLsRegisterMissingWord(query.getSegment().termIndex(), metadata.url(), query.queryHashes, reason);
final String reason = "no text snippet; errorCode = " + snippet.getErrorCode();
if (this.deleteIfSnippetFail) this.workTables.failURLsRegisterMissingWord(this.query.getSegment().termIndex(), metadata.url(), this.query.queryHashes, reason);
Log.logInfo("SEARCH", "sorted out url " + metadata.url().toNormalform(true, false) + " during search: " + reason);
return null;
}
} else {
// attach media information
startTime = System.currentTimeMillis();
final List<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), snippetFetchWordHashes, query.contentdom, cacheStrategy, 6000, !query.isLocal());
final List<MediaSnippet> mediaSnippets = MediaSnippet.retrieveMediaSnippets(metadata.url(), this.snippetFetchWordHashes, this.query.contentdom, cacheStrategy, 6000, !this.query.isLocal());
final long snippetComputationTime = System.currentTimeMillis() - startTime;
Log.logInfo("SEARCH", "media snippet load time for " + metadata.url() + ": " + snippetComputationTime);
if (mediaSnippets != null && !mediaSnippets.isEmpty()) {
// found media snippets, return entry
return new ResultEntry(page, query.getSegment(), peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
return new ResultEntry(page, this.query.getSegment(), this.peers, null, mediaSnippets, dbRetrievalTime, snippetComputationTime);
} else if (cacheStrategy.mustBeOffline()) {
return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, snippetComputationTime);
return new ResultEntry(page, this.query.getSegment(), this.peers, null, null, dbRetrievalTime, snippetComputationTime);
} else {
// problems with snippet fetch
String reason = "no media snippet";
if (deleteIfSnippetFail) this.workTables.failURLsRegisterMissingWord(query.getSegment().termIndex(), metadata.url(), query.queryHashes, reason);
final String reason = "no media snippet";
if (this.deleteIfSnippetFail) this.workTables.failURLsRegisterMissingWord(this.query.getSegment().termIndex(), metadata.url(), this.query.queryHashes, reason);
Log.logInfo("SEARCH", "sorted out url " + metadata.url().toNormalform(true, false) + " during search: " + reason);
return null;
}

@@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@@ -26,18 +26,17 @@
package de.anomic.search;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.SortedMap;
import java.util.concurrent.ConcurrentMap;
import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.ConcurrentMap;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.LoaderDispatcher;
import de.anomic.data.WorkTables;
import de.anomic.search.ResultFetcher.Worker;
import de.anomic.yacy.yacySeedDB;
public class SearchEventCache {
@@ -50,57 +49,68 @@ public class SearchEventCache {
public static final long memlimitMedium = 200 * 1024 * 1024; // 200 MB
public static String lastEventID = "";
public static long cacheInsert = 0, cacheHit = 0, cacheMiss = 0, cacheDelete = 0;
public static int size() {
return lastEvents.size();
}
public static void put(String eventID, SearchEvent event) {
public static void put(final String eventID, final SearchEvent event) {
if (MemoryControl.shortStatus()) cleanupEvents(true);
lastEventID = eventID;
SearchEvent oldEvent = lastEvents.put(eventID, event);
final SearchEvent oldEvent = lastEvents.put(eventID, event);
if (oldEvent == null) cacheInsert++;
}
public static void cleanupEvents(boolean all) {
// remove old events in the event cache
if (MemoryControl.shortStatus()) all = true;
final List<SearchEvent> delete = new ArrayList<SearchEvent>();
// the less memory is available, the shorter the time elements may stay in the cache
long memx = MemoryControl.available();
long acceptTime = memx > memlimitHigh ? eventLifetimeBigMem : memx > memlimitMedium ? eventLifetimeMediumMem : eventLifetimeShortMem;
final long memx = MemoryControl.available();
final long acceptTime = memx > memlimitHigh ? eventLifetimeBigMem : memx > memlimitMedium ? eventLifetimeMediumMem : eventLifetimeShortMem;
Map.Entry<String, SearchEvent> event;
Iterator<Map.Entry<String, SearchEvent>> i = lastEvents.entrySet().iterator();
final Iterator<Map.Entry<String, SearchEvent>> i = lastEvents.entrySet().iterator();
while (i.hasNext()) {
event = i.next();
if (all || event.getValue().getEventTime() + acceptTime < System.currentTimeMillis()) {
delete.add(event.getValue());
i.remove();
cacheDelete++;
}
}
/*
* thread to remove the events;
* this process may take time because it applies index modifications
* in case of failed words
*/
new Thread(){
@Override
public void run() {
for (SearchEvent k: delete) {
//System.out.println("**** CLEANUP SEARCH EVENT **** incache = " + lastEvents.size() + ", word = " + k.getQuery().queryWords()[0]);
k.cleanup();
if (workerAlive(event.getValue())) {
event.getValue().cleanup();
} else {
i.remove();
cacheDelete++;
}
}
}.start();
}
}
public static SearchEvent getEvent(final String eventID) {
SearchEvent event = lastEvents.get(eventID);
final SearchEvent event = lastEvents.get(eventID);
if (event == null) cacheMiss++; else cacheHit++;
return event;
}
public static int countAliveThreads() {
int alive = 0;
for (final SearchEvent e: SearchEventCache.lastEvents.values()) {
if (workerAlive(e)) alive++;
}
return alive;
}
private static boolean workerAlive(final SearchEvent e) {
if (e == null || e.result() == null || e.result().workerThreads == null) return false;
for (final Worker w: e.result().workerThreads) if (w != null && w.isAlive()) return true;
return false;
}
private static SearchEvent dummyEvent = null;
private static SearchEvent getDummyEvent(final WorkTables workTables, final LoaderDispatcher loader, final Segment indexSegment) {
if (dummyEvent != null) return dummyEvent;
final QueryParams query = new QueryParams("", 0, null, indexSegment, new RankingProfile(ContentDomain.TEXT), "");
dummyEvent = new SearchEvent(query, null, workTables, null, false, loader, 0, 0, 0, 0, false);
return dummyEvent;
}
public static SearchEvent getEvent(
final QueryParams query,
final yacySeedDB peers,
@@ -112,8 +122,8 @@ public class SearchEventCache {
final long remote_maxtime,
final int burstRobinsonPercent,
final int burstMultiwordPercent) {
String id = query.id(false);
final String id = query.id(false);
SearchEvent event = SearchEventCache.lastEvents.get(id);
if (event == null) cacheMiss++; else cacheHit++;
if (Switchboard.getSwitchboard() != null && !Switchboard.getSwitchboard().crawlQueues.noticeURL.isEmpty() && event != null && System.currentTimeMillis() - event.getEventTime() > 60000) {
@ -132,12 +142,40 @@ public class SearchEventCache {
}
}
if (event == null) {
// throttling in case of too many search requests
int waitcount = 0;
throttling : while (true) {
final int allowedThreads = (int) Math.max(1, MemoryControl.available() / (query.snippetCacheStrategy == null ? 10 : 100) / 1024 / 1024);
// make room if there are too many search events (they need a lot of RAM)
if (SearchEventCache.lastEvents.size() > allowedThreads) {
Log.logWarning("SearchEventCache", "throttling phase 1: " + SearchEventCache.lastEvents.size() + " in cache; " + countAliveThreads() + " alive; " + allowedThreads + " allowed");
cleanupEvents(false);
} else break throttling;
// if there are still too many, delete all deletable events
if (SearchEventCache.lastEvents.size() > allowedThreads) {
Log.logWarning("SearchEventCache", "throttling phase 2: " + SearchEventCache.lastEvents.size() + " in cache; " + countAliveThreads() + " alive; " + allowedThreads + " allowed");
cleanupEvents(true);
} else break throttling;
// there may still be events left whose worker threads are alive
if (countAliveThreads() < allowedThreads) break throttling;
// finally we just wait some time until we get access
Log.logWarning("SearchEventCache", "throttling phase 3: " + SearchEventCache.lastEvents.size() + " in cache; " + countAliveThreads() + " alive; " + allowedThreads + " allowed");
try { Thread.sleep(100); } catch (final InterruptedException e) { }
waitcount++;
if (waitcount >= 10) return getDummyEvent(workTables, loader, query.getSegment());
}
// check if there are too many other searches alive now
Log.logInfo("SearchEventCache", "getEvent: " + SearchEventCache.lastEvents.size() + " in cache; " + countAliveThreads() + " alive");
// start a new event
boolean delete = Switchboard.getSwitchboard() == null || Switchboard.getSwitchboard().getConfigBool("search.verify.delete", true);
final boolean delete = Switchboard.getSwitchboard() == null || Switchboard.getSwitchboard().getConfigBool("search.verify.delete", true);
event = new SearchEvent(query, peers, workTables, preselectedPeerHashes, generateAbstracts, loader, remote_maxcount, remote_maxtime, burstRobinsonPercent, burstMultiwordPercent, delete);
MemoryControl.request(100 * 1024 * 1024, false); // this may trigger a short memory status which causes a reducing of cache space of other threads
}
return event;
}
}
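The loop above is the search throttling announced in the commit message. A compact sketch of the three-phase scheme; CacheOps is a hypothetical stand-in for the SearchEventCache statics, and a false return corresponds to handing out the dummy event after roughly one second of waiting:

public final class ThrottleSketch {
    // hypothetical view of the cache operations used by the throttle
    interface CacheOps {
        int size();            // events currently cached
        int alive();           // events with running worker threads
        void cleanup(boolean all);
    }

    static boolean admit(final CacheOps cache, final int allowedThreads) {
        int waitcount = 0;
        while (true) {
            // phase 1: drop expired events only
            if (cache.size() > allowedThreads) cache.cleanup(false); else return true;
            // phase 2: drop every event that is not protected by a live worker
            if (cache.size() > allowedThreads) cache.cleanup(true); else return true;
            // events kept alive by workers are tolerated up to the limit
            if (cache.alive() < allowedThreads) return true;
            // phase 3: wait briefly and retry; give up after ~1 second
            try { Thread.sleep(100); } catch (final InterruptedException e) { }
            if (++waitcount >= 10) return false; // caller falls back to a dummy event
        }
    }
}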

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -39,26 +39,26 @@ public class ConcurrentScoreMap<E> extends AbstractScoreMap<E> implements ScoreM
protected final ConcurrentHashMap<E, AtomicLong> map; // a mapping from a reference to the cluster key
private long gcount;
public ConcurrentScoreMap() {
map = new ConcurrentHashMap<E, AtomicLong>();
gcount = 0;
this.map = new ConcurrentHashMap<E, AtomicLong>();
this.gcount = 0;
}
public Iterator<E> iterator() {
return map.keySet().iterator();
return this.map.keySet().iterator();
}
public synchronized void clear() {
map.clear();
gcount = 0;
this.map.clear();
this.gcount = 0;
}
/**
* shrink the cluster to a demanded size
* @param maxsize
*/
public void shrinkToMaxSize(int maxsize) {
public void shrinkToMaxSize(final int maxsize) {
if (this.map.size() <= maxsize) return;
int minScore = getMinScore();
while (this.map.size() > maxsize) {
@ -66,84 +66,84 @@ public class ConcurrentScoreMap<E> extends AbstractScoreMap<E> implements ScoreM
shrinkToMinScore(minScore);
}
}
/**
* shrink the cluster in such a way that the smallest score is equal or greater than a given minScore
* @param minScore
*/
public void shrinkToMinScore(int minScore) {
Iterator<Map.Entry<E, AtomicLong>> i = this.map.entrySet().iterator();
public void shrinkToMinScore(final int minScore) {
final Iterator<Map.Entry<E, AtomicLong>> i = this.map.entrySet().iterator();
Map.Entry<E, AtomicLong> entry;
while (i.hasNext()) {
entry = i.next();
if (entry.getValue().intValue() < minScore) i.remove();
}
}
public long totalCount() {
return gcount;
return this.gcount;
}
public int size() {
return map.size();
return this.map.size();
}
public boolean sizeSmaller(int size) {
return map.size() < size;
public boolean sizeSmaller(final int size) {
return this.map.size() < size;
}
public boolean isEmpty() {
return map.isEmpty();
return this.map.isEmpty();
}
public void inc(final E obj) {
if (obj == null) return;
// use atomic operations
this.map.putIfAbsent(obj, new AtomicLong(0));
this.map.get(obj).incrementAndGet();
// increase overall counter
gcount++;
this.gcount++;
}
public void dec(final E obj) {
if (obj == null) return;
// use atomic operations
this.map.putIfAbsent(obj, new AtomicLong(0));
this.map.get(obj).decrementAndGet();
// decrease overall counter
gcount--;
this.gcount--;
}
public void set(final E obj, final int newScore) {
if (obj == null) return;
// use atomic operations
this.map.putIfAbsent(obj, new AtomicLong(0));
this.map.get(obj).set(newScore);
// increase overall counter
gcount += newScore;
this.gcount += newScore;
}
public void inc(final E obj, final int incrementScore) {
if (obj == null) return;
// use atomic operations
this.map.putIfAbsent(obj, new AtomicLong(0));
this.map.get(obj).addAndGet(incrementScore);
// increase overall counter
gcount += incrementScore;
this.gcount += incrementScore;
}
public void dec(final E obj, final int decrementScore) {
inc(obj, -decrementScore);
}
public int delete(final E obj) {
// deletes entry and returns previous score
if (obj == null) return 0;
@ -151,41 +151,53 @@ public class ConcurrentScoreMap<E> extends AbstractScoreMap<E> implements ScoreM
if (score == null) return 0;
// decrease overall counter
gcount -= score.intValue();
this.gcount -= score.intValue();
return score.intValue();
}
public boolean containsKey(final E obj) {
return this.map.containsKey(obj);
}
public int get(final E obj) {
if (obj == null) return 0;
final AtomicLong score = this.map.get(obj);
if (score == null) return 0;
return score.intValue();
}
private int getMinScore() {
public int getMinScore() {
if (this.map.isEmpty()) return -1;
int minScore = Integer.MAX_VALUE;
for (Map.Entry<E, AtomicLong> entry: this.map.entrySet()) if (entry.getValue().intValue() < minScore) {
minScore = entry.getValue().intValue();
}
for (final Map.Entry<E, AtomicLong> entry : this.map.entrySet())
if (entry.getValue().intValue() < minScore) {
minScore = entry.getValue().intValue();
}
return minScore;
}
public int getMaxScore() {
if (this.map.isEmpty())
return -1;
int maxScore = Integer.MIN_VALUE;
for (final Map.Entry<E, AtomicLong> entry : this.map.entrySet())
if (entry.getValue().intValue() > maxScore) {
maxScore = entry.getValue().intValue();
}
return maxScore;
}
@Override
public String toString() {
return map.toString();
return this.map.toString();
}
public Iterator<E> keys(boolean up) {
public Iterator<E> keys(final boolean up) {
// re-organize entries
TreeMap<Integer, Set<E>> m = new TreeMap<Integer, Set<E>>();
final TreeMap<Integer, Set<E>> m = new TreeMap<Integer, Set<E>>();
Set<E> s;
Integer is;
for (Map.Entry<E, AtomicLong> entry: this.map.entrySet()) {
for (final Map.Entry<E, AtomicLong> entry: this.map.entrySet()) {
is = new Integer(entry.getValue().intValue());
s = m.get(is);
if (s == null) {
@ -196,18 +208,18 @@ public class ConcurrentScoreMap<E> extends AbstractScoreMap<E> implements ScoreM
s.add(entry.getKey());
}
}
// flatten result
List<E> l = new ArrayList<E>(m.size());
for (Set<E> f: m.values()) {
for (E e: f) l.add(e);
final List<E> l = new ArrayList<E>(m.size());
for (final Set<E> f: m.values()) {
for (final E e: f) l.add(e);
}
if (up) return l.iterator();
// optionally reverse list
List<E> r = new ArrayList<E>(l.size());
final List<E> r = new ArrayList<E>(l.size());
for (int i = l.size() - 1; i >= 0; i--) r.add(l.get(i));
return r.iterator();
}
}
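The inc()/dec() methods above get by without a map-wide lock by pairing ConcurrentHashMap.putIfAbsent with per-key AtomicLong counters; only the gcount total is updated with a plain ++ and is therefore approximate under concurrency. A minimal sketch of the per-key pattern:

import java.util.concurrent.ConcurrentHashMap;
import java.util.concurrent.atomic.AtomicLong;

// putIfAbsent guarantees that exactly one AtomicLong ever exists per key;
// the following get() observes whichever instance won the race, so two
// threads incrementing the same key cannot lose an update.
public class CounterSketch<E> {
    private final ConcurrentHashMap<E, AtomicLong> map = new ConcurrentHashMap<E, AtomicLong>();

    public void inc(final E key) {
        if (key == null) return;
        this.map.putIfAbsent(key, new AtomicLong(0));
        this.map.get(key).incrementAndGet();
    }

    public long get(final E key) {
        final AtomicLong score = this.map.get(key);
        return score == null ? 0 : score.longValue();
    }
}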

@ -1,4 +1,4 @@
//Document.java
//Document.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
@ -62,7 +62,7 @@ import net.yacy.kelondro.util.FileUtils;
public class Document {
private final MultiProtocolURI source; // the source url
private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document
@ -83,10 +83,10 @@ public class Document {
private Map<String, String> emaillinks;
private MultiProtocolURI favicon;
private boolean resorted;
private Set<String> languages;
private boolean indexingDenied;
private float lon, lat;
private Object parserObject; // the source object that was used to create the Document
private final Set<String> languages;
private final boolean indexingDenied;
private final float lon, lat;
private final Object parserObject; // the source object that was used to create the Document
public Document(final MultiProtocolURI location, final String mimeType, final String charset,
final Object parserObject,
@ -98,7 +98,7 @@ public class Document {
final Map<MultiProtocolURI, Properties> anchors,
final Map<MultiProtocolURI, String> rss,
final Map<MultiProtocolURI, ImageEntry> images,
boolean indexingDenied) {
final boolean indexingDenied) {
this.source = location;
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
@ -126,11 +126,11 @@ public class Document {
this.indexingDenied = indexingDenied;
this.text = text == null ? new ByteArrayOutputStream() : text;
}
public Object getParserObject() {
return this.parserObject;
}
/**
* compute a set of languages that this document contains
* the language is not computed using a statistical analysis of the content, only from given metadata that came with the document
@ -141,13 +141,13 @@ public class Document {
public String dc_language() {
if (this.languages == null) return null;
if (this.languages.isEmpty()) return null;
if (this.languages.size() == 1) return languages.iterator().next();
if (this.languages.size() == 1) return this.languages.iterator().next();
if (this.languages.contains(this.source.language())) return this.source.language();
// now we are confused: the declared languages all differ from the TLD
// just pick one of the languages that we have
return languages.iterator().next();
return this.languages.iterator().next();
}
/*
DC according to rfc 5013
@ -167,17 +167,17 @@ dc_relation
dc_coverage
dc_rights
*/
public String dc_title() {
return (title == null) ? "" : title.toString();
return (this.title == null) ? "" : this.title.toString();
}
public void setTitle(String title) {
public void setTitle(final String title) {
this.title = new StringBuilder(title);
}
public String dc_creator() {
return (creator == null) ? "" : creator.toString();
return (this.creator == null) ? "" : this.creator.toString();
}
public String[] dc_subject() {
@ -189,61 +189,62 @@ dc_rights
s = (this.keywords.get(i)).trim();
if (s.length() > 0) hs.add(s.toLowerCase());
}
String[] t = new String[hs.size()];
final String[] t = new String[hs.size()];
int i = 0;
for (String u: hs) t[i++] = u;
for (final String u: hs) t[i++] = u;
return t;
}
public String dc_subject(final char separator) {
String[] t = dc_subject();
final String[] t = dc_subject();
if (t.length == 0) return "";
// generate a new list
final StringBuilder sb = new StringBuilder(t.length * 8);
for (String s: t) sb.append(s).append(separator);
for (final String s: t) sb.append(s).append(separator);
return sb.substring(0, sb.length() - 1);
}
public String dc_description() {
if (description == null)
if (this.description == null)
return dc_title();
return description.toString();
return this.description.toString();
}
public String dc_publisher() {
return this.publisher == null ? "" : this.publisher;
}
public String dc_format() {
return this.mimeType;
}
public String dc_identifier() {
return this.source.toNormalform(true, false);
}
public MultiProtocolURI dc_source() {
return this.source;
}
/**
* @return the supposed charset of this document or <code>null</code> if unknown
*/
public String getCharset() {
return this.charset;
}
public String[] getSectionTitles() {
if (sections == null) {
if (this.sections == null) {
return new String[] { dc_title() };
}
return sections.toArray(new String[this.sections.size()]);
return this.sections.toArray(new String[this.sections.size()]);
}
public InputStream getText() {
try {
if (this.text == null) return new ByteArrayInputStream(UTF8.getBytes(""));
if (this.text instanceof String) {
//return new StreamReader((String) this.text);
return new ByteArrayInputStream(UTF8.getBytes(((String) this.text)));
} else if (this.text instanceof InputStream) {
return (InputStream) this.text;
@ -261,7 +262,7 @@ dc_rights
}
return new ByteArrayInputStream(UTF8.getBytes(""));
}
public byte[] getTextBytes() {
try {
if (this.text == null) return new byte[0];
@ -283,7 +284,7 @@ dc_rights
}
return new byte[0];
}
public long getTextLength() {
try {
if (this.text == null) return -1;
@ -303,81 +304,81 @@ dc_rights
} catch (final Exception e) {
Log.logException(e);
}
return -1;
return -1;
}
public List<StringBuilder> getSentences(final boolean pre) {
if (this.text == null) return null;
final SentenceReader e = new SentenceReader(getText());
e.pre(pre);
List<StringBuilder> sentences = new ArrayList<StringBuilder>();
final List<StringBuilder> sentences = new ArrayList<StringBuilder>();
while (e.hasNext()) {
sentences.add(e.next());
}
return sentences;
}
public List<String> getKeywords() {
return this.keywords;
}
public Map<MultiProtocolURI, Properties> getAnchors() {
// returns all links embedded as anchors (clickable entities)
// this is a url(String)/text(String) map
return anchors;
return this.anchors;
}
public Map<MultiProtocolURI, String> getRSS() {
// returns all links embedded as RSS feeds
// this is a url(String)/text(String) map
return rss;
return this.rss;
}
// the next three methods provide a calculated view on the getAnchors/getImages:
public Map<MultiProtocolURI, String> getHyperlinks() {
// this is a subset of the getAnchor-set: only links to other hyperrefs
if (!resorted) resortLinks();
return hyperlinks;
if (!this.resorted) resortLinks();
return this.hyperlinks;
}
public Map<MultiProtocolURI, String> getAudiolinks() {
if (!resorted) resortLinks();
if (!this.resorted) resortLinks();
return this.audiolinks;
}
public Map<MultiProtocolURI, String> getVideolinks() {
if (!resorted) resortLinks();
if (!this.resorted) resortLinks();
return this.videolinks;
}
public Map<MultiProtocolURI, ImageEntry> getImages() {
// returns all links embedded as pictures (visible in document)
// this returns a htmlFilterImageEntry collection
if (!resorted) resortLinks();
return images;
if (!this.resorted) resortLinks();
return this.images;
}
public Map<MultiProtocolURI, String> getApplinks() {
if (!resorted) resortLinks();
if (!this.resorted) resortLinks();
return this.applinks;
}
public Map<String, String> getEmaillinks() {
// this is part of the getAnchor-set: only links to email addresses
if (!resorted) resortLinks();
return emaillinks;
if (!this.resorted) resortLinks();
return this.emaillinks;
}
public float lon() {
return this.lon;
}
public float lat() {
return this.lat;
}
private void resortLinks() {
if (this.resorted) return;
synchronized (this) {
@ -387,7 +388,7 @@ dc_rights
String u;
int extpos, qpos;
String ext = null;
String thishost = this.source.getHost();
final String thishost = this.source.getHost();
this.inboundlinks = new HashMap<MultiProtocolURI, String>();
this.outboundlinks = new HashMap<MultiProtocolURI, String>();
this.hyperlinks = new HashMap<MultiProtocolURI, String>();
@ -396,10 +397,10 @@ dc_rights
this.applinks = new HashMap<MultiProtocolURI, String>();
this.emaillinks = new HashMap<String, String>();
final Map<MultiProtocolURI, ImageEntry> collectedImages = new HashMap<MultiProtocolURI, ImageEntry>(); // this is a set that is collected now and joined later to the imagelinks
for (Map.Entry<MultiProtocolURI, ImageEntry> entry: collectedImages.entrySet()) {
for (final Map.Entry<MultiProtocolURI, ImageEntry> entry: collectedImages.entrySet()) {
if (entry.getKey().getHost().equals(thishost)) this.inboundlinks.put(entry.getKey(), "image"); else this.outboundlinks.put(entry.getKey(), "image");
}
for (Map.Entry<MultiProtocolURI, Properties> entry: anchors.entrySet()) {
for (final Map.Entry<MultiProtocolURI, Properties> entry: this.anchors.entrySet()) {
url = entry.getKey();
if (url == null) continue;
if ((thishost == null && url.getHost() == null) ||
@ -411,9 +412,9 @@ dc_rights
this.outboundlinks.put(url, "anchor");
}
u = url.toNormalform(true, false);
String name = entry.getValue().getProperty("name", "");
final String name = entry.getValue().getProperty("name", "");
if (u.startsWith("mailto:")) {
emaillinks.put(u.substring(7), name);
this.emaillinks.put(u.substring(7), name);
} else {
extpos = u.lastIndexOf('.');
if (extpos > 0) {
@ -427,39 +428,39 @@ dc_rights
if (Classification.isImageExtension(ext)) {
ContentScraper.addImage(collectedImages, new ImageEntry(url, name, -1, -1, -1));
}
else if (Classification.isAudioExtension(ext)) audiolinks.put(url, name);
else if (Classification.isVideoExtension(ext)) videolinks.put(url, name);
else if (Classification.isApplicationExtension(ext)) applinks.put(url, name);
else if (Classification.isAudioExtension(ext)) this.audiolinks.put(url, name);
else if (Classification.isVideoExtension(ext)) this.videolinks.put(url, name);
else if (Classification.isApplicationExtension(ext)) this.applinks.put(url, name);
}
}
// in any case we consider this as a link and let the parser decide if that link can be followed
hyperlinks.put(url, name);
this.hyperlinks.put(url, name);
}
}
// add image links that we collected from the anchors to the image map
ContentScraper.addAllImages(images, collectedImages);
ContentScraper.addAllImages(this.images, collectedImages);
// expand the hyperlinks:
// we add artificial hyperlinks to the hyperlink set
// that can be calculated from given hyperlinks and imagelinks
hyperlinks.putAll(allReflinks(images.values()));
hyperlinks.putAll(allReflinks(audiolinks.keySet()));
hyperlinks.putAll(allReflinks(videolinks.keySet()));
hyperlinks.putAll(allReflinks(applinks.keySet()));
this.hyperlinks.putAll(allReflinks(this.images.values()));
this.hyperlinks.putAll(allReflinks(this.audiolinks.keySet()));
this.hyperlinks.putAll(allReflinks(this.videolinks.keySet()));
this.hyperlinks.putAll(allReflinks(this.applinks.keySet()));
/*
hyperlinks.putAll(allSubpaths(hyperlinks.keySet()));
hyperlinks.putAll(allSubpaths(images.values()));
hyperlinks.putAll(allSubpaths(audiolinks.keySet()));
hyperlinks.putAll(allSubpaths(videolinks.keySet()));
hyperlinks.putAll(allSubpaths(applinks.keySet()));
*/
*/
// don't do this again
this.resorted = true;
}
}
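resortLinks() above classifies every anchor twice: by host into inbound/outbound and by path extension into the media maps. A condensed sketch of that step with plain strings and a hard-coded extension set standing in for MultiProtocolURI and Classification:

import java.util.HashMap;
import java.util.Map;

public class LinkSorterSketch {
    // extension of the path part only; dots in the host must not count
    static String extensionOf(final String path) {
        final int extpos = path.lastIndexOf('.');
        return (extpos > path.lastIndexOf('/')) ? path.substring(extpos + 1).toLowerCase() : "";
    }

    static void classify(final String thishost, final String host, final String path, final String name,
                         final Map<String, String> inbound, final Map<String, String> outbound,
                         final Map<String, String> audiolinks) {
        final String url = "http://" + host + path;
        // host comparison decides inbound vs. outbound
        if (host.equals(thishost)) inbound.put(url, "anchor"); else outbound.put(url, "anchor");
        // the extension routes media links; stand-in for Classification.isAudioExtension()
        final String ext = extensionOf(path);
        if (ext.equals("mp3") || ext.equals("ogg")) audiolinks.put(url, name);
    }

    public static void main(final String[] args) {
        final Map<String, String> in = new HashMap<String, String>();
        final Map<String, String> out = new HashMap<String, String>();
        final Map<String, String> audio = new HashMap<String, String>();
        classify("example.org", "example.org", "/track.mp3", "a song", in, out, audio);
        System.out.println(in.keySet() + " " + audio.keySet()); // same host -> inbound, .mp3 -> audio
    }
}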
public static Map<MultiProtocolURI, String> allSubpaths(final Collection<?> links) {
// links is either a Set of Strings (urls) or a Set of
// htmlFilterImageEntries
@ -506,7 +507,7 @@ dc_rights
}
return v;
}
public static Map<MultiProtocolURI, String> allReflinks(final Collection<?> links) {
// links is either a Set of Strings (with urls) or
// htmlFilterImageEntries
@ -556,95 +557,95 @@ dc_rights
}
return v;
}
public void addSubDocuments(final Document[] docs) throws IOException {
for (Document doc: docs) {
for (final Document doc: docs) {
this.sections.addAll(Arrays.asList(doc.getSectionTitles()));
if (this.title.length() > 0) this.title.append('\n');
this.title.append(doc.dc_title());
this.keywords.addAll(doc.getKeywords());
if (this.description.length() > 0) this.description.append('\n');
this.description.append(doc.dc_description());
if (!(this.text instanceof ByteArrayOutputStream)) {
this.text = new ByteArrayOutputStream();
}
FileUtils.copy(doc.getText(), (ByteArrayOutputStream) this.text);
anchors.putAll(doc.getAnchors());
rss.putAll(doc.getRSS());
ContentScraper.addAllImages(images, doc.getImages());
this.anchors.putAll(doc.getAnchors());
this.rss.putAll(doc.getRSS());
ContentScraper.addAllImages(this.images, doc.getImages());
}
}
/**
* @return the {@link URL} to the favicon that belongs to the document
*/
public MultiProtocolURI getFavicon() {
return this.favicon;
}
/**
* @param faviconURL the {@link URL} to the favicon that belongs to the document
*/
public void setFavicon(final MultiProtocolURI faviconURL) {
this.favicon = faviconURL;
}
public int inboundLinkCount() {
if (this.inboundlinks == null) resortLinks();
return (this.inboundlinks == null) ? 0 : this.inboundlinks.size();
}
public int outboundLinkCount() {
if (this.outboundlinks == null) resortLinks();
return (this.outboundlinks == null) ? 0 : this.outboundlinks.size();
}
public Set<MultiProtocolURI> inboundLinks() {
if (this.inboundlinks == null) resortLinks();
return (this.inboundlinks == null) ? null : this.inboundlinks.keySet();
}
public Set<MultiProtocolURI> outboundLinks() {
if (this.outboundlinks == null) resortLinks();
return (this.outboundlinks == null) ? null : this.outboundlinks.keySet();
}
public boolean indexingDenied() {
return this.indexingDenied;
}
public void writeXML(final Writer os, final Date date) throws IOException {
os.write("<record>\n");
String title = this.dc_title();
final String title = dc_title();
if (title != null && title.length() > 0) os.write("<dc:title><![CDATA[" + title + "]]></dc:title>\n");
os.write("<dc:identifier>" + this.dc_identifier() + "</dc:identifier>\n");
String creator = this.dc_creator();
os.write("<dc:identifier>" + dc_identifier() + "</dc:identifier>\n");
final String creator = dc_creator();
if (creator != null && creator.length() > 0) os.write("<dc:creator><![CDATA[" + creator + "]]></dc:creator>\n");
String publisher = this.dc_publisher();
final String publisher = dc_publisher();
if (publisher != null && publisher.length() > 0) os.write("<dc:publisher><![CDATA[" + publisher + "]]></dc:publisher>\n");
String subject = this.dc_subject(';');
final String subject = this.dc_subject(';');
if (subject != null && subject.length() > 0) os.write("<dc:subject><![CDATA[" + subject + "]]></dc:subject>\n");
if (this.text != null) {
os.write("<dc:description><![CDATA[");
byte[] buffer = new byte[1000];
final byte[] buffer = new byte[1000];
int c = 0;
InputStream is = this.getText();
final InputStream is = getText();
while ((c = is.read(buffer)) > 0) os.write(UTF8.String(buffer, 0, c));
is.close();
os.write("]]></dc:description>\n");
}
String language = this.dc_language();
if (language != null && language.length() > 0) os.write("<dc:language>" + this.dc_language() + "</dc:language>\n");
final String language = dc_language();
if (language != null && language.length() > 0) os.write("<dc:language>" + dc_language() + "</dc:language>\n");
os.write("<dc:date>" + ISO8601Formatter.FORMATTER.format(date) + "</dc:date>\n");
if (this.lon != 0.0f && this.lat != 0.0f) os.write("<geo:Point><geo:long>" + this.lon +"</geo:long><geo:lat>" + this.lat + "</geo:lat></geo:Point>\n");
os.write("</record>\n");
}
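For orientation, a record emitted by writeXML() has roughly the following shape; every value here is invented:

<record>
<dc:title><![CDATA[Example page]]></dc:title>
<dc:identifier>http://example.net/page.html</dc:identifier>
<dc:creator><![CDATA[Jane Doe]]></dc:creator>
<dc:subject><![CDATA[example;sketch]]></dc:subject>
<dc:description><![CDATA[Example body text]]></dc:description>
<dc:language>en</dc:language>
<dc:date>2011-05-30T12:00:00Z</dc:date>
</record>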
@Override
public String toString() {
final ByteArrayOutputStream baos = new ByteArrayOutputStream();
@ -653,31 +654,31 @@ dc_rights
writeXML(osw, new Date());
osw.close();
return UTF8.String(baos.toByteArray());
} catch (UnsupportedEncodingException e1) {
} catch (final UnsupportedEncodingException e1) {
return "";
} catch (IOException e) {
} catch (final IOException e) {
return "";
}
}
public void close() {
if (this.text == null) return;
// try close the output stream
if (this.text instanceof InputStream) try {
((InputStream) this.text).close();
} catch (final Exception e) {} finally {
this.text = null;
}
// delete the temp file
if (this.text instanceof File) try {
FileUtils.deletedelete((File) this.text);
if (this.text instanceof File) try {
FileUtils.deletedelete((File) this.text);
} catch (final Exception e) {} finally {
this.text = null;
}
}
/**
* merge documents: a helper method for all parsers that return multiple documents
* @param docs
@ -688,7 +689,7 @@ dc_rights
{
if (docs == null || docs.length == 0) return null;
if (docs.length == 1) return docs[0];
long docTextLength = 0;
final ByteBuffer content = new ByteBuffer();
final StringBuilder authors = new StringBuilder(80);
@ -702,40 +703,40 @@ dc_rights
final Map<MultiProtocolURI, String> rss = new HashMap<MultiProtocolURI, String>();
final Map<MultiProtocolURI, ImageEntry> images = new HashMap<MultiProtocolURI, ImageEntry>();
float lon = 0.0f, lat = 0.0f;
for (Document doc: docs) {
String author = doc.dc_creator();
for (final Document doc: docs) {
final String author = doc.dc_creator();
if (author.length() > 0) {
if (authors.length() > 0) authors.append(",");
authors.append(author);
}
String publisher = doc.dc_publisher();
final String publisher = doc.dc_publisher();
if (publisher.length() > 0) {
if (publishers.length() > 0) publishers.append(",");
publishers.append(publisher);
}
String subject = doc.dc_subject(',');
final String subject = doc.dc_subject(',');
if (subject.length() > 0) {
if (subjects.length() > 0) subjects.append(",");
subjects.append(subject);
}
if (title.length() > 0) title.append("\n");
title.append(doc.dc_title());
sectionTitles.addAll(Arrays.asList(doc.getSectionTitles()));
if (description.length() > 0) description.append("\n");
description.append(doc.dc_description());
if (doc.getTextLength() > 0) {
if (docTextLength > 0) content.write('\n');
try {
docTextLength += FileUtils.copy(doc.getText(), content);
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
}
@ -763,7 +764,7 @@ dc_rights
images,
false);
}
public static Map<MultiProtocolURI, String> getHyperlinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (final Document d: documents) {
@ -771,16 +772,16 @@ dc_rights
}
return result;
}
public static Map<MultiProtocolURI, String> getImagelinks(final Document[] documents) {
final Map<MultiProtocolURI, String> result = new HashMap<MultiProtocolURI, String>();
for (final Document d: documents) {
for (ImageEntry imageReference : d.getImages().values()) {
for (final ImageEntry imageReference : d.getImages().values()) {
result.put(imageReference.url(), imageReference.alt());
}
}
return result;
}
}

@ -7,12 +7,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -41,6 +41,7 @@ import net.yacy.document.parser.docParser;
import net.yacy.document.parser.genericParser;
import net.yacy.document.parser.gzipParser;
import net.yacy.document.parser.htmlParser;
import net.yacy.document.parser.mmParser;
import net.yacy.document.parser.odtParser;
import net.yacy.document.parser.ooxmlParser;
import net.yacy.document.parser.pdfParser;
@ -49,6 +50,7 @@ import net.yacy.document.parser.psParser;
import net.yacy.document.parser.rssParser;
import net.yacy.document.parser.rtfParser;
import net.yacy.document.parser.sevenzipParser;
import net.yacy.document.parser.sidAudioParser;
import net.yacy.document.parser.swfParser;
import net.yacy.document.parser.tarParser;
import net.yacy.document.parser.torrentParser;
@ -57,10 +59,9 @@ import net.yacy.document.parser.vsdParser;
import net.yacy.document.parser.xlsParser;
import net.yacy.document.parser.zipParser;
import net.yacy.document.parser.images.genericImageParser;
import net.yacy.document.parser.mmParser;
import net.yacy.document.parser.sidAudioParser;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
public final class TextParser {
@ -73,7 +74,7 @@ public final class TextParser {
private static final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();
private static final Map<String, Object> denyMime = new ConcurrentHashMap<String, Object>();
private static final Map<String, Object> denyExtensionx = new ConcurrentHashMap<String, Object>();
static {
initParser(new bzipParser());
initParser(new csvParser());
@ -99,43 +100,43 @@ public final class TextParser {
initParser(new xlsParser());
initParser(new zipParser());
}
public static Set<Parser> parsers() {
Set<Parser> c = new HashSet<Parser>();
final Set<Parser> c = new HashSet<Parser>();
c.addAll(ext2parser.values());
c.addAll(mime2parser.values());
return c;
}
private static void initParser(Parser parser) {
private static void initParser(final Parser parser) {
String prototypeMime = null;
for (String mime: parser.supportedMimeTypes()) {
for (final String mime: parser.supportedMimeTypes()) {
// process the mime types
final String mimeType = normalizeMimeType(mime);
if (prototypeMime == null) prototypeMime = mimeType;
Parser p0 = mime2parser.get(mimeType);
final Parser p0 = mime2parser.get(mimeType);
if (p0 != null) log.logSevere("parser for mime '" + mimeType + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
mime2parser.put(mimeType, parser);
Log.logInfo("PARSER", "Parser for mime type '" + mimeType + "': " + parser.getName());
}
if (prototypeMime != null) for (String ext: parser.supportedExtensions()) {
ext = ext.toLowerCase();
String s = ext2mime.get(ext);
final String s = ext2mime.get(ext);
if (s != null) log.logSevere("parser for extension '" + ext + "' was set to mime '" + s + "', overwriting with new mime '" + prototypeMime + "'.");
ext2mime.put(ext, prototypeMime);
}
for (String ext: parser.supportedExtensions()) {
// process the extensions
ext = ext.toLowerCase();
Parser p0 = ext2parser.get(ext);
final Parser p0 = ext2parser.get(ext);
if (p0 != null) log.logSevere("parser for extension '" + ext + "' was set to '" + p0.getName() + "', overwriting with new parser '" + parser.getName() + "'.");
ext2parser.put(ext, parser);
Log.logInfo("PARSER", "Parser for extension '" + ext + "': " + parser.getName());
}
}
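initParser() above fills three registries: mime type to parser, extension to parser, and extension to a prototype mime type (so a bare extension can later be normalized to a mime type). A reduced sketch with a hypothetical Idiom interface in place of YaCy's Parser:

import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class ParserRegistrySketch {
    interface Idiom { String name(); }

    private final Map<String, Idiom> mime2parser = new ConcurrentHashMap<String, Idiom>();
    private final Map<String, Idiom> ext2parser = new ConcurrentHashMap<String, Idiom>();
    private final Map<String, String> ext2mime = new ConcurrentHashMap<String, String>();

    public void register(final Idiom idiom, final String[] mimes, final String[] exts) {
        String prototypeMime = null;
        for (final String mime : mimes) {
            if (prototypeMime == null) prototypeMime = mime; // first mime becomes the prototype
            final Idiom previous = this.mime2parser.put(mime.toLowerCase(), idiom);
            if (previous != null) System.err.println("mime '" + mime + "' re-registered by " + idiom.name());
        }
        for (final String ext : exts) {
            if (prototypeMime != null) this.ext2mime.put(ext.toLowerCase(), prototypeMime);
            this.ext2parser.put(ext.toLowerCase(), idiom);
        }
    }
}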
public static Document[] parseSource(
final MultiProtocolURI location,
final String mimeType,
@ -162,19 +163,19 @@ public final class TextParser {
} finally {
if (sourceStream != null) try { sourceStream.close(); } catch (final Exception ex) {}
}
for (Document d: docs) { assert d.getText() != null; } // verify docs
for (final Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
}
public static Document[] parseSource(
final MultiProtocolURI location,
String mimeType,
final String mimeType,
final String charset,
final byte[] content
) throws Parser.Failure {
return parseSource(location, mimeType, charset, content.length, new ByteArrayInputStream(content));
}
public static Document[] parseSource(
final MultiProtocolURI location,
String mimeType,
@ -187,39 +188,39 @@ public final class TextParser {
List<Parser> idioms = null;
try {
idioms = parsers(location, mimeType);
} catch (Parser.Failure e) {
} catch (final Parser.Failure e) {
final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage();
log.logWarning(errorMsg);
throw new Parser.Failure(errorMsg, location);
}
assert !idioms.isEmpty() : "no parsers applied for url " + location.toNormalform(true, false);
// if we have only one parser, or the content size exceeds Integer.MAX_VALUE,
// then we use a single stream-oriented parser.
if (idioms.size() == 1 || contentLength > Integer.MAX_VALUE) {
// use a specific stream-oriented parser
Document[] docs = parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
for (Document d: docs) { assert d.getText() != null; } // verify docs
final Document[] docs = parseSource(location, mimeType, idioms.get(0), charset, contentLength, sourceStream);
for (final Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
}
// if several parsers are candidates, we first load the content into a byte[] and use that
// as the basis for a number of different parse attempts.
byte[] b = null;
try {
b = FileUtils.read(sourceStream, (int) contentLength);
} catch (IOException e) {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
}
Document[] docs = parseSource(location, mimeType, idioms, charset, b);
for (Document d: docs) { assert d.getText() != null; } // verify docs
final Document[] docs = parseSource(location, mimeType, idioms, charset, b);
for (final Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
}
private static Document[] parseSource(
final MultiProtocolURI location,
String mimeType,
Parser parser,
final String mimeType,
final Parser parser,
final String charset,
final long contentLength,
final InputStream sourceStream
@ -231,10 +232,10 @@ public final class TextParser {
if (log.isFine()) log.logInfo("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "'.");
try {
Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
for (Document d: docs) { assert d.getText() != null; } // verify docs
final Document[] docs = parser.parse(location, mimeType, documentCharset, sourceStream);
for (final Document d: docs) { assert d.getText() != null; } // verify docs
return docs;
} catch (Exception e) {
} catch (final Exception e) {
throw new Parser.Failure("parser failed: " + parser.getName(), location);
}
}
@ -252,20 +253,22 @@ public final class TextParser {
assert !parsers.isEmpty();
Document[] docs = null;
HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
for (Parser parser: parsers) {
try {
docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
} catch (Parser.Failure e) {
failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
} catch (Exception e) {
failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
final HashMap<Parser, Parser.Failure> failedParser = new HashMap<Parser, Parser.Failure>();
if (MemoryControl.request(sourceArray.length * 2, false)) {
for (final Parser parser: parsers) {
try {
docs = parser.parse(location, mimeType, documentCharset, new ByteArrayInputStream(sourceArray));
} catch (final Parser.Failure e) {
failedParser.put(parser, e);
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
} catch (final Exception e) {
failedParser.put(parser, new Parser.Failure(e.getMessage(), location));
//log.logWarning("tried parser '" + parser.getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + e.getMessage(), e);
}
if (docs != null) break;
}
if (docs != null) break;
}
if (docs == null) {
if (failedParser.isEmpty()) {
final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed.";
@ -273,34 +276,34 @@ public final class TextParser {
throw new Parser.Failure(errorMsg, location);
} else {
String failedParsers = "";
for (Map.Entry<Parser, Parser.Failure> error: failedParser.entrySet()) {
for (final Map.Entry<Parser, Parser.Failure> error: failedParser.entrySet()) {
log.logWarning("tried parser '" + error.getKey().getName() + "' to parse " + location.toNormalform(true, false) + " but failed: " + error.getValue().getMessage(), error.getValue());
failedParsers += error.getKey().getName() + " ";
}
throw new Parser.Failure("All parser failed: " + failedParsers, location);
}
}
for (Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
for (final Document d: docs) { assert d.getText() != null : "mimeType = " + mimeType; } // verify docs
return docs;
}
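The MemoryControl.request(sourceArray.length * 2, false) check added above is one of the memory limitations mentioned in the commit message: the in-memory multi-parser loop only runs when roughly twice the payload size appears to be available. A generic sketch of such a guard, with a crude free-heap estimate standing in for MemoryControl:

import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;

public class MemoryGuardSketch {
    // crude stand-in for MemoryControl.request(): estimate the free heap
    static boolean hasRoomFor(final long bytes) {
        final Runtime rt = Runtime.getRuntime();
        final long used = rt.totalMemory() - rt.freeMemory();
        return rt.maxMemory() - used > bytes;
    }

    static InputStream openGuarded(final byte[] content) throws IOException {
        // parsers typically build a Document of about the same size as the
        // input, hence the factor of two before buffering everything in RAM
        if (!hasRoomFor(content.length * 2L)) {
            throw new IOException("not enough memory to parse " + content.length + " bytes");
        }
        return new ByteArrayInputStream(content);
    }
}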
/**
* check if the parser supports the given content.
* @param url
* @param mimeType
* @return null if the content is supported; otherwise an error string describing the problem.
*/
public static String supports(final MultiProtocolURI url, String mimeType) {
public static String supports(final MultiProtocolURI url, final String mimeType) {
try {
// try to get a parser. If this works, we don't need the parser itself, we just return null to show that everything is ok.
List<Parser> idioms = parsers(url, mimeType);
final List<Parser> idioms = parsers(url, mimeType);
return (idioms == null || idioms.isEmpty() || (idioms.size() == 1 && idioms.get(0).getName().equals(genericIdiom.getName()))) ? "no parser found" : null;
} catch (Parser.Failure e) {
} catch (final Parser.Failure e) {
// in case that a parser is not available, return an error string describing the problem.
return e.getMessage();
}
}
/**
* find a parser for a given url and mime type
* because mime types returned by web servers are sometimes wrong, we also compute the mime type again
@ -315,8 +318,8 @@ public final class TextParser {
* @throws Parser.Failure
*/
private static List<Parser> parsers(final MultiProtocolURI url, String mimeType1) throws Parser.Failure {
List<Parser> idioms = new ArrayList<Parser>(2);
final List<Parser> idioms = new ArrayList<Parser>(2);
// check extension
String ext = url.getFileExtension();
Parser idiom;
@ -326,7 +329,7 @@ public final class TextParser {
idiom = ext2parser.get(ext);
if (idiom != null) idioms.add(idiom);
}
// check given mime type
if (mimeType1 != null) {
mimeType1 = normalizeMimeType(mimeType1);
@ -334,15 +337,15 @@ public final class TextParser {
idiom = mime2parser.get(mimeType1);
if (idiom != null && !idioms.contains(idiom)) idioms.add(idiom);
}
// check mime type computed from extension
String mimeType2 = ext2mime.get(ext);
final String mimeType2 = ext2mime.get(ext);
if (mimeType2 != null && (idiom = mime2parser.get(mimeType2)) != null && !idioms.contains(idiom)) idioms.add(idiom);
// always add the generic parser
idioms.add(genericIdiom);
//if (idioms.isEmpty()) throw new Parser.Failure("no parser found for extension '" + ext + "' and mime type '" + mimeType1 + "'", url);
return idioms;
}
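parsers() above assembles its candidates in a fixed priority order: extension match, transmitted mime type, mime type derived from the extension, and finally the generic fallback. A condensed restatement with generic map parameters mirroring ext2parser, mime2parser and ext2mime:

import java.util.ArrayList;
import java.util.List;
import java.util.Map;

public class ParserOrderSketch {
    static <P> List<P> candidates(final String ext, final String mime,
                                  final Map<String, P> byExt, final Map<String, P> byMime,
                                  final Map<String, String> extToMime, final P generic) {
        final List<P> idioms = new ArrayList<P>(4);
        P idiom = byExt.get(ext);                       // 1. by file extension
        if (idiom != null) idioms.add(idiom);
        if (mime != null && (idiom = byMime.get(mime)) != null
                && !idioms.contains(idiom)) idioms.add(idiom);  // 2. by transmitted mime type
        final String derived = extToMime.get(ext);      // 3. by mime type derived from the extension
        if (derived != null && (idiom = byMime.get(derived)) != null
                && !idioms.contains(idiom)) idioms.add(idiom);
        idioms.add(generic);                            // 4. generic parser, always last
        return idioms;
    }
}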
public static String supportsMime(String mimeType) {
@ -354,67 +357,67 @@ public final class TextParser {
}
public static String supportsExtension(final MultiProtocolURI url) {
String ext = url.getFileExtension().toLowerCase();
final String ext = url.getFileExtension().toLowerCase();
if (ext == null || ext.length() == 0) return null;
if (denyExtensionx.containsKey(ext)) return "file extension '" + ext + "' is denied (2)";
String mimeType = ext2mime.get(ext);
final String mimeType = ext2mime.get(ext);
if (mimeType == null) return "no parser available";
Parser idiom = mime2parser.get(mimeType);
final Parser idiom = mime2parser.get(mimeType);
assert idiom != null;
if (idiom == null) return "no parser available (internal error!)";
return null;
}
public static String mimeOf(MultiProtocolURI url) {
public static String mimeOf(final MultiProtocolURI url) {
return mimeOf(url.getFileExtension());
}
public static String mimeOf(String ext) {
public static String mimeOf(final String ext) {
return ext2mime.get(ext.toLowerCase());
}
private static String normalizeMimeType(String mimeType) {
if (mimeType == null) return "application/octet-stream";
mimeType = mimeType.toLowerCase();
final int pos = mimeType.indexOf(';');
return ((pos < 0) ? mimeType.trim() : mimeType.substring(0, pos).trim());
}
public static void setDenyMime(String denyList) {
public static void setDenyMime(final String denyList) {
denyMime.clear();
String n;
for (String s: denyList.split(",")) {
for (final String s: denyList.split(",")) {
n = normalizeMimeType(s);
if (n != null && n.length() > 0) denyMime.put(n, v);
}
}
public static String getDenyMime() {
String s = "";
for (String d: denyMime.keySet()) s += d + ",";
for (final String d: denyMime.keySet()) s += d + ",";
if (s.length() > 0) s = s.substring(0, s.length() - 1);
return s;
}
public static void grantMime(String mime, boolean grant) {
String n = normalizeMimeType(mime);
public static void grantMime(final String mime, final boolean grant) {
final String n = normalizeMimeType(mime);
if (n == null || n.length() == 0) return;
if (grant) denyMime.remove(n); else denyMime.put(n, v);
}
public static void setDenyExtension(String denyList) {
public static void setDenyExtension(final String denyList) {
denyExtensionx.clear();
for (String s: denyList.split(",")) denyExtensionx.put(s, v);
for (final String s: denyList.split(",")) denyExtensionx.put(s, v);
}
public static String getDenyExtension() {
String s = "";
for (String d: denyExtensionx.keySet()) s += d + ",";
for (final String d: denyExtensionx.keySet()) s += d + ",";
if (s.length() > 0) s = s.substring(0, s.length() - 1);
return s;
}
public static void grantExtension(String ext, boolean grant) {
public static void grantExtension(final String ext, final boolean grant) {
if (grant) denyExtensionx.remove(ext); else denyExtensionx.put(ext, v);
}

@ -58,15 +58,15 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private final char degree = '\u00B0';
private final char[] minuteCharsHTML = "&#039;".toCharArray();
// statics: for initialization of the HTMLFilterAbstractScraper
private static final Set<String> linkTags0 = new HashSet<String>(9,0.99f);
private static final Set<String> linkTags1 = new HashSet<String>(7,0.99f);
public enum TagType {
singleton, pair;
}
public enum Tag {
html(TagType.singleton), // scraped as singleton to get attached properties like 'lang'
body(TagType.singleton), // scraped as singleton to get attached properties like 'class'
@ -96,14 +96,14 @@ public class ContentScraper extends AbstractScraper implements Scraper {
script(TagType.pair);
public TagType type;
private Tag(TagType type) {
private Tag(final TagType type) {
this.type = type;
}
}
// all these tags must be given in lowercase, because the tags from the files are compared in lowercase
static {
for (Tag tag: Tag.values()) {
for (final Tag tag: Tag.values()) {
if (tag.type == TagType.singleton) linkTags0.add(tag.name());
if (tag.type == TagType.pair) linkTags1.add(tag.name());
}
@ -112,33 +112,33 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// class variables: collectors for links
private Map<MultiProtocolURI, Properties> anchors;
private Map<MultiProtocolURI, String> rss, css;
private Set<MultiProtocolURI> script, frames, iframes;
private final Map<MultiProtocolURI, String> rss, css;
private final Set<MultiProtocolURI> script, frames, iframes;
private Map<MultiProtocolURI, ImageEntry> images; // urlhash/image relation
private final Map<String, String> metas;
private String title;
//private String headline;
private List<String>[] headlines;
private ClusteredScoreMap<String> bold, italic;
private List<String> li;
private final ClusteredScoreMap<String> bold, italic;
private final List<String> li;
private CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
private float lon, lat;
/**
* {@link MultiProtocolURI} to the favicon that belongs to the document
*/
private MultiProtocolURI favicon;
/**
* The document root {@link MultiProtocolURI}
* The document root {@link MultiProtocolURI}
*/
private MultiProtocolURI root;
/**
* evaluation scores: count appearance of specific attributes
*/
private Evaluation evaluationScores;
private final Evaluation evaluationScores;
@SuppressWarnings("unchecked")
public ContentScraper(final MultiProtocolURI root) {
@ -157,7 +157,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.script = new HashSet<MultiProtocolURI>();
this.title = "";
this.headlines = new ArrayList[6];
for (int i = 0; i < this.headlines.length; i++) headlines[i] = new ArrayList<String>();
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>();
this.italic = new ClusteredScoreMap<String>();
this.li = new ArrayList<String>();
@ -167,28 +167,28 @@ public class ContentScraper extends AbstractScraper implements Scraper {
this.lat = 0.0f;
this.evaluationScores.match(Element.url, root.toNormalform(false, false));
}
public void scrapeText(final char[] newtext, final String insideTag) {
// System.out.println("SCRAPE: " + UTF8.String(newtext));
int p, pl, q, s = 0;
// match evaluation pattern
this.evaluationScores.match(Element.text, newtext);
// try to find location information in text
// Opencaching:
// <nobr>N 50o 05.453&#039;</nobr><nobr>E 008o 30.191&#039;</nobr>
// N 52o 28.025 E 013o 20.299
location: while (s < newtext.length) {
pl = 1;
p = CharBuffer.indexOf(newtext, s, degree);
p = CharBuffer.indexOf(newtext, s, this.degree);
if (p < 0) {p = CharBuffer.indexOf(newtext, s, "&deg;".toCharArray()); if (p >= 0) pl = 5;}
if (p < 0) break location;
q = CharBuffer.indexOf(newtext, p + pl, minuteCharsHTML);
q = CharBuffer.indexOf(newtext, p + pl, this.minuteCharsHTML);
if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, "'".toCharArray());
if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, " E".toCharArray());
if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, " W".toCharArray());
if (q < 0 && newtext.length - p == 7 + pl) q = newtext.length;
if (q < 0 && newtext.length - p == 7 + pl) q = newtext.length;
if (q < 0) break location;
int r = p;
while (r-- > 1) {
@ -254,22 +254,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
s = p + 6;
try {
url = new MultiProtocolURI(u);
anchors.put(url, new Properties());
this.anchors.put(url, new Properties());
continue;
} catch (MalformedURLException e) {}
} catch (final MalformedURLException e) {}
}
// append string to content
if (b.length() != 0) content.append(b).append(32);
if (b.length() != 0) this.content.append(b).append(32);
}
private static final int find(final String s, final String m, final int start) {
final int p = s.indexOf(m, start);
return (p < 0) ? Integer.MAX_VALUE : p;
}
private MultiProtocolURI absolutePath(final String relativePath) {
try {
return MultiProtocolURI.newURL(root, relativePath);
return MultiProtocolURI.newURL(this.root, relativePath);
} catch (final Exception e) {
return null;
}
@ -277,7 +277,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void scrapeTag0(final String tagname, final Properties tagopts) {
if (tagname.equalsIgnoreCase("img")) {
String src = tagopts.getProperty("src", "");
final String src = tagopts.getProperty("src", "");
try {
final int width = Integer.parseInt(tagopts.getProperty("width", "-1"));
final int height = Integer.parseInt(tagopts.getProperty("height", "-1"));
@ -285,48 +285,48 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final MultiProtocolURI url = absolutePath(src);
if (url != null) {
final ImageEntry ie = new ImageEntry(url, tagopts.getProperty("alt", ""), width, height, -1);
addImage(images, ie);
addImage(this.images, ie);
}
}
} catch (final NumberFormatException e) {}
this.evaluationScores.match(Element.imgpath, src);
} else if(tagname.equalsIgnoreCase("base")) {
try {
root = new MultiProtocolURI(tagopts.getProperty("href", ""));
this.root = new MultiProtocolURI(tagopts.getProperty("href", ""));
} catch (final MalformedURLException e) {}
} else if (tagname.equalsIgnoreCase("frame")) {
MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
anchors.put(src, tagopts /* with property "name" */);
frames.add(src);
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
this.anchors.put(src, tagopts /* with property "name" */);
this.frames.add(src);
this.evaluationScores.match(Element.framepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("body")) {
String c = tagopts.getProperty("class", "");
final String c = tagopts.getProperty("class", "");
this.evaluationScores.match(Element.bodyclass, c);
} else if (tagname.equalsIgnoreCase("div")) {
String id = tagopts.getProperty("id", "");
final String id = tagopts.getProperty("id", "");
this.evaluationScores.match(Element.divid, id);
} else if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", "");
String content = tagopts.getProperty("content","");
final String content = tagopts.getProperty("content","");
if (name.length() > 0) {
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
if (name.equals("generator")) {
this.evaluationScores.match(Element.metagenerator, content);
}
} else {
name = tagopts.getProperty("http-equiv", "");
if (name.length() > 0) {
metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
this.metas.put(name.toLowerCase(), CharacterCoding.html2unicode(content));
}
}
} else if (tagname.equalsIgnoreCase("area")) {
final String areatitle = cleanLine(tagopts.getProperty("title",""));
//String alt = tagopts.getProperty("alt","");
final String href = tagopts.getProperty("href", "");
Properties p = new Properties(); p.put("name", areatitle);
if (href.length() > 0) anchors.put(absolutePath(href), p);
final Properties p = new Properties(); p.put("name", areatitle);
if (href.length() > 0) this.anchors.put(absolutePath(href), p);
} else if (tagname.equalsIgnoreCase("link")) {
String href = tagopts.getProperty("href", "");
final String href = tagopts.getProperty("href", "");
final MultiProtocolURI newLink = absolutePath(href);
if (newLink != null) {
@ -336,31 +336,31 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (rel.equalsIgnoreCase("shortcut icon")) {
final ImageEntry ie = new ImageEntry(newLink, linktitle, -1, -1, -1);
images.put(ie.url(), ie);
this.images.put(ie.url(), ie);
this.favicon = newLink;
} else if (rel.equalsIgnoreCase("alternate") && type.equalsIgnoreCase("application/rss+xml")) {
rss.put(newLink, linktitle);
this.rss.put(newLink, linktitle);
} else if (rel.equalsIgnoreCase("stylesheet") && type.equalsIgnoreCase("text/css")) {
css.put(newLink, rel);
this.css.put(newLink, rel);
this.evaluationScores.match(Element.csspath, href);
} else if (!rel.equalsIgnoreCase("stylesheet") && !rel.equalsIgnoreCase("alternate stylesheet")) {
Properties p = new Properties(); p.put("name", linktitle);
anchors.put(newLink, p);
final Properties p = new Properties(); p.put("name", linktitle);
this.anchors.put(newLink, p);
}
}
} else if(tagname.equalsIgnoreCase("embed")) {
anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
this.anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts /* with property "name" */);
} else if(tagname.equalsIgnoreCase("param")) {
final String name = tagopts.getProperty("name", "");
if (name.equalsIgnoreCase("movie")) {
anchors.put(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */);
this.anchors.put(absolutePath(tagopts.getProperty("value", "")), tagopts /* with property "name" */);
}
}
// fire event
fireScrapeTag0(tagname, tagopts);
}
public void scrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
// System.out.println("ScrapeTag1: tagname=" + tagname + ", opts=" + tagopts.toString() + ", text=" + UTF8.String(text));
if (tagname.equalsIgnoreCase("a") && text.length < 2048) {
@ -373,10 +373,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
addImage(images, ie);
addImage(this.images, ie);
} else {
tagopts.put("name", recursiveParse(text));
anchors.put(url, tagopts);
this.anchors.put(url, tagopts);
}
}
this.evaluationScores.match(Element.apath, href);
@ -384,45 +384,45 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String h;
if ((tagname.equalsIgnoreCase("h1")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[0].add(h);
if (h.length() > 0) this.headlines[0].add(h);
} else if((tagname.equalsIgnoreCase("h2")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[1].add(h);
if (h.length() > 0) this.headlines[1].add(h);
} else if ((tagname.equalsIgnoreCase("h3")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[2].add(h);
if (h.length() > 0) this.headlines[2].add(h);
} else if ((tagname.equalsIgnoreCase("h4")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[3].add(h);
if (h.length() > 0) this.headlines[3].add(h);
} else if ((tagname.equalsIgnoreCase("h5")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[4].add(h);
if (h.length() > 0) this.headlines[4].add(h);
} else if ((tagname.equalsIgnoreCase("h6")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) headlines[5].add(h);
if (h.length() > 0) this.headlines[5].add(h);
} else if ((tagname.equalsIgnoreCase("title")) && (text.length < 1024)) {
title = recursiveParse(text);
this.title = recursiveParse(text);
} else if ((tagname.equalsIgnoreCase("b")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) bold.inc(h);
if (h.length() > 0) this.bold.inc(h);
} else if ((tagname.equalsIgnoreCase("strong")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) bold.inc(h);
if (h.length() > 0) this.bold.inc(h);
} else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) italic.inc(h);
if (h.length() > 0) this.italic.inc(h);
} else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) li.add(h);
if (h.length() > 0) this.li.add(h);
} else if (tagname.equalsIgnoreCase("iframe")) {
MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
anchors.put(src, tagopts /* with property "name" */);
iframes.add(src);
final MultiProtocolURI src = absolutePath(tagopts.getProperty("src", ""));
this.anchors.put(src, tagopts /* with property "name" */);
this.iframes.add(src);
this.evaluationScores.match(Element.iframepath, src.toNormalform(true, false));
} else if (tagname.equalsIgnoreCase("script")) {
String src = tagopts.getProperty("src", "");
final String src = tagopts.getProperty("src", "");
if (src.length() > 0) {
script.add(absolutePath(src));
this.script.add(absolutePath(src));
this.evaluationScores.match(Element.scriptpath, src);
} else {
this.evaluationScores.match(Element.scriptcode, text);
@ -432,7 +432,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// fire event
fireScrapeTag1(tagname, tagopts, text);
}
public void scrapeComment(final char[] comment) {
this.evaluationScores.match(Element.comment, comment);
@ -440,28 +440,30 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private String recursiveParse(final char[] inlineHtml) {
if (inlineHtml.length < 14) return cleanLine(super.stripAll(inlineHtml));
// start a new scraper to parse links inside this text
// parsing the content
final ContentScraper scraper = new ContentScraper(this.root);
final ContentScraper scraper = new ContentScraper(this.root);
final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
try {
FileUtils.copy(new CharArrayReader(inlineHtml), writer);
writer.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
return cleanLine(super.stripAll(inlineHtml));
}
this.anchors.putAll(scraper.getAnchors());
this.images.putAll(scraper.images);
return cleanLine(super.stripAll(scraper.content.getChars()));
}
private final static String cleanLine(final String s) {
final StringBuilder sb = new StringBuilder(s.length());
char l = ' ';
for (char c : s.toCharArray()) {
char c;
for (int i = 0; i < s.length(); i++) {
c = s.charAt(i);
if (c < ' ') c = ' ';
if (c == ' ') {
if (l != ' ') sb.append(c);
@ -470,91 +472,91 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
l = c;
}
// return result
return sb.toString().trim();
}
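// A minimal sketch of the cleanLine() contract (hypothetical helper, not part
// of the original class): control characters become spaces, runs of spaces
// collapse to one, and the result is trimmed.
private static void cleanLineDemo() {
    final String raw = "  a\tb\n\nc  ";
    assert cleanLine(raw).equals("a b c"); // "  a b  c  " -> "a b c"
}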
public String getTitle() {
// construct a title string, even if the document has no title
// some documents have a title tag as meta tag
String s = metas.get("title");
String s = this.metas.get("title");
// try to construct the title with the content of the title tag
if (title.length() > 0) {
if (this.title.length() > 0) {
if (s == null) {
return title;
return this.title;
}
if ((title.compareToIgnoreCase(s) == 0) || (title.indexOf(s) >= 0)) return s;
return title + ": " + s;
if ((this.title.compareToIgnoreCase(s) == 0) || (this.title.indexOf(s) >= 0)) return s;
return this.title + ": " + s;
}
if (s != null) {
return s;
}
// otherwise take any headline
for (int i = 0; i < this.headlines.length; i++) {
if (!this.headlines[i].isEmpty()) return this.headlines[i].get(0);
}
// take description tag
s = getDescription();
if (s.length() > 0) return s;
// extract headline from file name
return MultiProtocolURI.unescape(root.getFileName());
return MultiProtocolURI.unescape(this.root.getFileName());
}
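// getHeadlines() takes the HTML heading level (1..6, i.e. h1..h6) and maps it
// onto the zero-based headlines[] array, hence the i - 1 below.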
public String[] getHeadlines(final int i) {
assert ((i >= 1) && (i <= this.headlines.length));
return this.headlines[i - 1].toArray(new String[this.headlines[i - 1].size()]);
}
public String[] getBold() {
List<String> a = new ArrayList<String>();
Iterator<String> i = this.bold.keys(false);
while (i.hasNext()) a.add(i.next());
final List<String> a = new ArrayList<String>();
final Iterator<String> i = this.bold.keys(false);
while (i.hasNext()) a.add(i.next());
return a.toArray(new String[a.size()]);
}
public String[] getBoldCount(String[] a) {
String[] counter = new String[a.length];
public String[] getBoldCount(final String[] a) {
final String[] counter = new String[a.length];
for (int i = 0; i < a.length; i++) counter[i] = Integer.toString(this.bold.get(a[i]));
return counter;
}
public String[] getItalic() {
List<String> a = new ArrayList<String>();
Iterator<String> i = this.italic.keys(false);
while (i.hasNext()) a.add(i.next());
final List<String> a = new ArrayList<String>();
final Iterator<String> i = this.italic.keys(false);
while (i.hasNext()) a.add(i.next());
return a.toArray(new String[a.size()]);
}
public String[] getItalicCount(String[] a) {
String[] counter = new String[a.length];
public String[] getItalicCount(final String[] a) {
final String[] counter = new String[a.length];
for (int i = 0; i < a.length; i++) counter[i] = Integer.toString(this.italic.get(a[i]));
return counter;
}
public String[] getLi() {
return this.li.toArray(new String[this.li.size()]);
}
public boolean containsFlash() {
this.anchors = new HashMap<MultiProtocolURI, Properties>();
String ext;
for (MultiProtocolURI url: this.anchors.keySet()) {
for (final MultiProtocolURI url: this.anchors.keySet()) {
ext = url.getFileExtension();
if (ext == null) continue;
if (ext.equals("swf")) return true;
}
return false;
}
public byte[] getText() {
try {
return content.getBytes();
return this.content.getBytes();
} catch (final OutOfMemoryError e) {
Log.logException(e);
return new byte[0];
@ -563,31 +565,31 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public Map<MultiProtocolURI, Properties> getAnchors() {
// returns a url (String) / name (String) relation
return anchors;
return this.anchors;
}
public Map<MultiProtocolURI, String> getRSS() {
// returns a url (String) / name (String) relation
return rss;
return this.rss;
}
public Map<MultiProtocolURI, String> getCSS() {
// returns a url (String) / name (String) relation
return css;
return this.css;
}
public Set<MultiProtocolURI> getFrames() {
// returns the set of frame source URLs
return frames;
return this.frames;
}
public Set<MultiProtocolURI> getIFrames() {
// returns the set of iframe source URLs
return iframes;
return this.iframes;
}
public Set<MultiProtocolURI> getScript() {
return script;
return this.script;
}
/**
@ -596,16 +598,16 @@ public class ContentScraper extends AbstractScraper implements Scraper {
*/
public Map<MultiProtocolURI, ImageEntry> getImages() {
// this returns a String(absolute url)/htmlFilterImageEntry - relation
return images;
return this.images;
}
public Map<String, String> getMetas() {
return metas;
return this.metas;
}
/**
* @return the {@link MultiProtocolURI} to the favicon that belongs to the document
*/
*/
public MultiProtocolURI getFavicon() {
return this.favicon;
}
@ -618,52 +620,52 @@ public class ContentScraper extends AbstractScraper implements Scraper {
<meta name="DC.format" scheme="DCTERMS.IMT" content="text/html" />
<meta name="DC.type" scheme="DCTERMS.DCMIType" content="Text" />
*/
public boolean indexingDenied() {
String s = metas.get("robots");
final String s = this.metas.get("robots");
if (s == null) return false;
if (s.indexOf("noindex") >= 0) return true;
return false;
}
public String getDescription() {
String s = metas.get("description");
if (s == null) s = metas.get("dc.description");
String s = this.metas.get("description");
if (s == null) s = this.metas.get("dc.description");
if (s == null) return "";
return s;
}
public String getContentType() {
final String s = metas.get("content-type");
final String s = this.metas.get("content-type");
if (s == null) return "";
return s;
}
public String getAuthor() {
String s = metas.get("author");
if (s == null) s = metas.get("dc.creator");
String s = this.metas.get("author");
if (s == null) s = this.metas.get("dc.creator");
if (s == null) return "";
return s;
}
public String getPublisher() {
String s = metas.get("copyright");
if (s == null) s = metas.get("dc.publisher");
String s = this.metas.get("copyright");
if (s == null) s = this.metas.get("dc.publisher");
if (s == null) return "";
return s;
}
private final static Pattern commaSepPattern = Pattern.compile(" |,");
private final static Pattern semicSepPattern = Pattern.compile(" |;");
public Set<String> getContentLanguages() {
// i.e. <meta name="DC.language" content="en" scheme="DCTERMS.RFC3066">
// or <meta http-equiv="content-language" content="en">
String s = metas.get("content-language");
if (s == null) s = metas.get("dc.language");
String s = this.metas.get("content-language");
if (s == null) s = this.metas.get("dc.language");
if (s == null) return null;
Set<String> hs = new HashSet<String>();
String[] cl = commaSepPattern.split(s);
final Set<String> hs = new HashSet<String>();
final String[] cl = commaSepPattern.split(s);
int p;
for (int i = 0; i < cl.length; i++) {
cl[i] = cl[i].toLowerCase();
@ -674,10 +676,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (hs.isEmpty()) return null;
return hs;
}
public String[] getKeywords() {
String s = metas.get("keywords");
if (s == null) s = metas.get("dc.description");
String s = this.metas.get("keywords");
if (s == null) s = this.metas.get("dc.description");
if (s == null) s = "";
if (s.length() == 0) {
return MultiProtocolURI.splitpattern.split(getTitle().toLowerCase());
@ -686,9 +688,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if (s.contains(";")) return semicSepPattern.split(s);
return s.split("\\s");
}
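// Split precedence sketch (hypothetical values; the comma test comes first):
//   "yacy,search engine"  -> commaSepPattern (" |,") -> ["yacy", "search", "engine"]
//   "p2p;web;crawler"     -> semicSepPattern (" |;") -> ["p2p", "web", "crawler"]
//   "distributed search"  -> split("\\s")            -> ["distributed", "search"]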
public int getRefreshSeconds() {
final String s = metas.get("refresh");
final String s = this.metas.get("refresh");
if (s == null) return 9999;
try {
final int pos = s.indexOf(';');
@ -701,9 +703,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
public String getRefreshPath() {
String s = metas.get("refresh");
String s = this.metas.get("refresh");
if (s == null) return "";
final int pos = s.indexOf(';');
if (pos < 0) return "";
s = s.substring(pos + 1);
@ -714,10 +716,10 @@ public class ContentScraper extends AbstractScraper implements Scraper {
// parse location
// <meta NAME="ICBM" CONTENT="38.90551492, 1.454004505" />
// <meta NAME="geo.position" CONTENT="38.90551492;1.454004505" />
public float getLon() {
if (this.lon != 0.0f) return this.lon;
String s = metas.get("ICBM"); // InterContinental Ballistic Missile (abbrev. supposed to be a joke: http://www.jargon.net/jargonfile/i/ICBMaddress.html), see http://geourl.org/add.html#icbm
String s = this.metas.get("ICBM"); // InterContinental Ballistic Missile (abbrev. supposed to be a joke: http://www.jargon.net/jargonfile/i/ICBMaddress.html), see http://geourl.org/add.html#icbm
if (s != null) {
int p = s.indexOf(';');
if (p < 0) p = s.indexOf(',');
@ -728,7 +730,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
if (this.lon != 0.0f) return this.lon;
s = metas.get("geo.position"); // http://geotags.com/geobot/add-tags.html
s = this.metas.get("geo.position"); // http://geotags.com/geobot/add-tags.html
if (s != null) {
int p = s.indexOf(';');
if (p < 0) p = s.indexOf(',');
@ -740,13 +742,13 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
return this.lon;
}
public float getLat() {
if (this.lat != 0.0f) return this.lat;
getLon(); // parse with the getLon() method, which also computes the lat value
return this.lat;
}
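// Separator probe sketch for the two geo meta formats (hypothetical values,
// assuming the ICBM convention of latitude first):
//   "38.90551492, 1.454004505"  ->  p = indexOf(',')  ->  lat = 38.90551492, lon = 1.454004505
//   "38.90551492;1.454004505"   ->  p = indexOf(';')  ->  lat = 38.90551492, lon = 1.454004505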
/**
* produce all model names
* @return a set of model names
@ -754,26 +756,26 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public Set<String> getEvaluationModelNames() {
return this.evaluationScores.getModelNames();
}
public String[] getEvaluationModelScoreNames(String modelName) {
List<String> a = new ArrayList<String>();
ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
public String[] getEvaluationModelScoreNames(final String modelName) {
final List<String> a = new ArrayList<String>();
final ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
if (scores != null) {
Iterator<String> i = scores.keys(false);
final Iterator<String> i = scores.keys(false);
while (i.hasNext()) a.add(i.next());
}
return a.toArray(new String[a.size()]);
}
public String[] getEvaluationModelScoreCounts(String modelName, String[] a) {
ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
String[] counter = new String[a.length];
public String[] getEvaluationModelScoreCounts(final String modelName, final String[] a) {
final ClusteredScoreMap<String> scores = this.evaluationScores.getScores(modelName);
final String[] counter = new String[a.length];
if (scores != null) {
for (int i = 0; i < a.length; i++) counter[i] = Integer.toString(scores.get(a[i]));
}
return counter;
}
/*
* (non-Javadoc)
* @see de.anomic.htmlFilter.htmlFilterScraper#close()
@ -782,37 +784,37 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public void close() {
// free resources
super.close();
anchors = null;
images = null;
title = null;
headlines = null;
content = null;
root = null;
this.anchors = null;
this.images = null;
this.title = null;
this.headlines = null;
this.content = null;
this.root = null;
}
public void print() {
System.out.println("TITLE :" + title);
System.out.println("TITLE :" + this.title);
for (int i = 0; i < 4; i++) {
System.out.println("HEADLINE" + i + ":" + headlines[i].toString());
System.out.println("HEADLINE" + i + ":" + this.headlines[i].toString());
}
System.out.println("ANCHORS :" + anchors.toString());
System.out.println("IMAGES :" + images.toString());
System.out.println("METAS :" + metas.toString());
System.out.println("TEXT :" + content.toString());
System.out.println("ANCHORS :" + this.anchors.toString());
System.out.println("IMAGES :" + this.images.toString());
System.out.println("METAS :" + this.metas.toString());
System.out.println("TEXT :" + this.content.toString());
}
public void registerHtmlFilterEventListener(final ScraperListener listener) {
if (listener != null) {
this.htmlFilterEventListeners.add(ScraperListener.class, listener);
}
}
}
public void deregisterHtmlFilterEventListener(final ScraperListener listener) {
if (listener != null) {
this.htmlFilterEventListeners.remove(ScraperListener.class, listener);
}
}
}
private void fireScrapeTag0(final String tagname, final Properties tagopts) {
final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
for (int i=0; i<listeners.length; i+=2) {
@ -820,8 +822,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
((ScraperListener)listeners[i+1]).scrapeTag0(tagname, tagopts);
}
}
}
}
private void fireScrapeTag1(final String tagname, final Properties tagopts, final char[] text) {
final Object[] listeners = this.htmlFilterEventListeners.getListenerList();
for (int i=0; i<listeners.length; i+=2) {
@ -830,26 +832,26 @@ public class ContentScraper extends AbstractScraper implements Scraper {
}
}
}
public static ContentScraper parseResource(final File file) throws IOException {
// load page
final byte[] page = FileUtils.read(file);
if (page == null) throw new IOException("no content in file " + file.toString());
// scrape document to look up charset
final ScraperInputStream htmlFilter = new ScraperInputStream(new ByteArrayInputStream(page),"UTF-8", new MultiProtocolURI("http://localhost"),null,false);
String charset = htmlParser.patchCharsetEncoding(htmlFilter.detectCharset());
if(charset == null)
charset = Charset.defaultCharset().toString();
// scrape content
final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
final Writer writer = new TransformerWriter(null, null, scraper, null, false);
FileUtils.copy(new ByteArrayInputStream(page), writer, Charset.forName(charset));
return scraper;
}
public static void addAllImages(final Map<MultiProtocolURI, ImageEntry> a, final Map<MultiProtocolURI, ImageEntry> b) {
final Iterator<Map.Entry<MultiProtocolURI, ImageEntry>> i = b.entrySet().iterator();
Map.Entry<MultiProtocolURI, ImageEntry> ie;
@ -858,7 +860,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
addImage(a, ie.getValue());
}
}
public static void addImage(final Map<MultiProtocolURI, ImageEntry> a, final ImageEntry ie) {
if (a.containsKey(ie.url())) {
// in case of a collision, take that image that has the better image size tags
@ -867,6 +869,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
a.put(ie.url(), ie);
}
}
}

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -38,6 +38,7 @@ import java.util.regex.PatternSyntaxException;
import net.yacy.cora.storage.ClusteredScoreMap;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.MemoryControl;
/*
@ -58,7 +59,7 @@ import net.yacy.kelondro.logging.Log;
public class Evaluation {
private static List<Model> models = new ArrayList<Model>(); // the list of all models that shall be applied
public static enum Element {
text,
bodyclass,
@ -74,42 +75,42 @@ public class Evaluation {
apath,
comment;
}
private static class Attribute {
public String subject; // the name of the attribute
public Pattern pattern; // the pattern that must match for that attribute
public Attribute(String subject, Pattern pattern) {
public Attribute(final String subject, final Pattern pattern) {
this.subject = subject;
this.pattern = pattern;
}
}
private static class Model {
private String modelName;
private Map<Element, List<Attribute>> elementMatcher; // a mapping from element-names to lists of Attributes
public Model(File patternProperties) throws IOException {
private final String modelName;
private final Map<Element, List<Attribute>> elementMatcher; // a mapping from element-names to lists of Attributes
public Model(final File patternProperties) throws IOException {
if (!patternProperties.exists()) throw new IOException("File does not exist: " + patternProperties);
String name = patternProperties.getName();
final String name = patternProperties.getName();
if (!name.startsWith("parser.")) throw new IOException("file name must start with 'parser.': " + name);
if (!name.endsWith(".properties")) throw new IOException("file name must end with '.properties': " + name);
this.modelName = name.substring(7, name.length() - 11);
if (this.modelName.length() < 1) throw new IOException("file name too short: " + name);
// load the file
Properties p = new Properties();
final Properties p = new Properties();
p.load(new FileReader(patternProperties));
// iterate through the properties and generate method patterns
elementMatcher = new HashMap<Element, List<Attribute>>();
this.elementMatcher = new HashMap<Element, List<Attribute>>();
String subject, elementName;
Element element;
Pattern pattern;
for (Map.Entry<Object, Object> entry: p.entrySet()) {
String k = (String) entry.getKey();
String v = (String) entry.getValue();
int w = k.indexOf('_');
for (final Map.Entry<Object, Object> entry: p.entrySet()) {
final String k = (String) entry.getKey();
final String v = (String) entry.getValue();
final int w = k.indexOf('_');
if (w < 0) {
Log.logSevere("PatternAnalysis", "wrong configuration in " + name + ": separator '_' missing: " + k);
continue;
@ -118,7 +119,7 @@ public class Evaluation {
elementName = k.substring(w + 1);
try {
pattern = Pattern.compile(v);
} catch (PatternSyntaxException e) {
} catch (final PatternSyntaxException e) {
Log.logSevere("PatternAnalysis", "bad pattern in " + name + ": '" + k + "=" + v + "' - " + e.getDescription());
continue;
}
@ -135,35 +136,35 @@ public class Evaluation {
attributeList.add(new Attribute(subject, pattern));
}
}
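// Illustration of the expected properties format (hypothetical file
// "parser.shopping.properties", yielding modelName "shopping"): each key is
// "<subject>_<elementName>" and each value is a regular expression, e.g.
//   checkout_apath=.*/checkout/.*
//   tracker_scriptpath=.*analytics.*
// A content match on the named element increments the score for the subject
// on the left of the '_'.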
public String getName() {
return this.modelName;
}
/**
* match elementContents for a specific elementName
* @param element - the name of the element as Element enum type
* @param content - the content of the element
* @return a list of subject names that match with the element
*/
public ClusteredScoreMap<String> match(Element element, String content) {
ClusteredScoreMap<String> subjects = new ClusteredScoreMap<String>();
List<Attribute> patterns = this.elementMatcher.get(element);
public ClusteredScoreMap<String> match(final Element element, final CharSequence content) {
final ClusteredScoreMap<String> subjects = new ClusteredScoreMap<String>();
final List<Attribute> patterns = this.elementMatcher.get(element);
if (patterns == null) return subjects;
for (Attribute attribute: patterns) {
for (final Attribute attribute: patterns) {
if (attribute.pattern.matcher(content).matches()) subjects.inc(attribute.subject);
}
return subjects;
}
}
private final Map<String, ClusteredScoreMap<String>> modelMap; // a map from model names to attribute scores
public Evaluation() {
this.modelMap = new HashMap<String, ClusteredScoreMap<String>>();
}
/**
* produce all model names
* @return a set of model names
@ -171,14 +172,14 @@ public class Evaluation {
public Set<String> getModelNames() {
return this.modelMap.keySet();
}
/**
* calculate the scores for a model
* the score is an attribute/count map which counts how often a specific attribute was found
* @param modelName
* @return
*/
public ClusteredScoreMap<String> getScores(String modelName) {
public ClusteredScoreMap<String> getScores(final String modelName) {
return this.modelMap.get(modelName);
}
@ -187,23 +188,23 @@ public class Evaluation {
* @param f
* @throws IOException
*/
public static void add(File f) throws IOException {
Model pattern = new Model(f);
public static void add(final File f) throws IOException {
final Model pattern = new Model(f);
models.add(pattern);
}
/**
* match some content within a specific element
* this will increase the statistics counters for models if a model matches
* @param element - the element where a matching is made
* @param content - the content of the element which shall be matched
*/
public void match(Element element, String content) {
public void match(final Element element, final CharSequence content) {
if (models.isEmpty()) return; // fast return if this feature is not used
ClusteredScoreMap<String> newScores, oldScores;
for (Model pattern: models) {
for (final Model pattern: models) {
newScores = pattern.match(element, content);
oldScores = this.getScores(pattern.getName());
oldScores = getScores(pattern.getName());
if (oldScores == null) {
oldScores = new ClusteredScoreMap<String>();
this.modelMap.put(pattern.getName(), oldScores);
@ -211,10 +212,12 @@ public class Evaluation {
oldScores.inc(newScores);
}
}
public void match(Element element, char[] content) {
public void match(final Element element, final char[] content) {
if (models.isEmpty()) return; // fast return if this feature is not used
match(element, new String(content));
if (MemoryControl.request(content.length * 2, false)) {
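// copying a char[] into a String roughly doubles the footprint of the
// content; if that much heap cannot be granted, the match is skipped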
match(element, new String(content) /*Segment(content, 0, content.length)*/);
}
}
}

@ -96,10 +96,10 @@ public final class TransformerWriter extends Writer {
this.inStyle = false;
this.binaryUnsuspect = true;
this.passbyIfBinarySuspect = passbyIfBinarySuspect;
if (this.outStream != null) {
this.out = new OutputStreamWriter(this.outStream,(charSet == null)?Charset.defaultCharset():charSet);
}
}
}
public static char[] genTag0raw(final String tagname, final boolean opening, final char[] tagopts) {
@ -118,7 +118,7 @@ public final class TransformerWriter extends Writer {
final char[] result = bb.getChars();
try {
bb.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
return result;
@ -138,13 +138,13 @@ public final class TransformerWriter extends Writer {
final char[] result = bb.getChars();
try {
bb.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
return result;
}
public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) {
public static char[] genTag0(final String tagname, final Properties tagopts, final char quotechar) {
final char[] tagoptsx = (tagopts.isEmpty()) ? null : genOpts(tagopts, quotechar);
final CharBuffer bb = new CharBuffer(tagname.length() + ((tagoptsx == null) ? 0 : (tagoptsx.length + 1)) + tagname.length() + 2);
bb.append((int)'<').append(tagname);
@ -156,27 +156,27 @@ public final class TransformerWriter extends Writer {
final char[] result = bb.getChars();
try {
bb.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
return result;
}
public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
public static char[] genTag1(final String tagname, final Properties tagopts, final char[] text, final char quotechar) {
final char[] gt0 = genTag0(tagname, tagopts, quotechar);
final CharBuffer cb = new CharBuffer(gt0, gt0.length + text.length + tagname.length() + 3);
cb.append(text).append((int)'<').append((int)'/').append(tagname).append((int)'>');
final char[] result = cb.getChars();
try {
cb.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
return result;
}
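// Illustration (hypothetical values): with tagopts {href=http://example.net/},
//   genTag1("a", tagopts, "example".toCharArray(), '"')
// regenerates the character sequence
//   <a href="http://example.net/">example</a>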
// a helper method for pretty-printing of properties for html tags
public static char[] genOpts(final Properties prop, final char quotechar) {
public static char[] genOpts(final Properties prop, final char quotechar) {
final Enumeration<?> e = prop.propertyNames();
final CharBuffer bb = new CharBuffer(prop.size() * 40);
String key;
@ -184,7 +184,7 @@ public final class TransformerWriter extends Writer {
key = (String) e.nextElement();
bb.append(32).append(key).append((int)'=').append((int)quotechar);
bb.append(prop.getProperty(key));
bb.append((int)quotechar);
bb.append((int)quotechar);
}
final char[] result;
if (bb.length() > 0)
@ -193,7 +193,7 @@ public final class TransformerWriter extends Writer {
result = bb.getChars();
try {
bb.close();
} catch (IOException ex) {
} catch (final IOException ex) {
Log.logException(ex);
}
return result;
@ -201,113 +201,115 @@ public final class TransformerWriter extends Writer {
private char[] filterTag(final String tag, final boolean opening, final char[] content, final char quotechar) {
// System.out.println("FILTER1: filterTag=" + ((filterTag == null) ? "null" : filterTag) + ", tag=" + tag + ", opening=" + ((opening) ? "true" : "false") + ", content=" + UTF8.String(content)); // debug
if (filterTag == null) {
if (this.filterTag == null) {
// we are not collecting tag text
if (tag == null) {
// and this is not a tag opener/closer
if (scraper != null) scraper.scrapeText(content, null);
if (transformer != null) return transformer.transformText(content);
if (this.scraper != null) this.scraper.scrapeText(content, null);
if (this.transformer != null) return this.transformer.transformText(content);
return content;
}
// we have a new tag
if (opening) {
if ((scraper != null) && (scraper.isTag0(tag))) {
if ((this.scraper != null) && (this.scraper.isTag0(tag))) {
// this single tag is collected at once here
final CharBuffer charBuffer = new CharBuffer(content);
scraper.scrapeTag0(tag, charBuffer.propParser());
this.scraper.scrapeTag0(tag, charBuffer.propParser());
try {
charBuffer.close();
} catch (IOException e) {
} catch (final IOException e) {
// TODO Auto-generated catch block
Log.logException(e);
}
}
if ((transformer != null) && (transformer.isTag0(tag))) {
if ((this.transformer != null) && (this.transformer.isTag0(tag))) {
// this single tag is collected at once here
final CharBuffer scb = new CharBuffer(content);
try {
return transformer.transformTag0(tag, scb.propParser(), quotechar);
return this.transformer.transformTag0(tag, scb.propParser(), quotechar);
} finally {
try {
scb.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
}
} else if (((scraper != null) && (scraper.isTag1(tag))) ||
((transformer != null) && (transformer.isTag1(tag)))) {
} else if (((this.scraper != null) && (this.scraper.isTag1(tag))) ||
((this.transformer != null) && (this.transformer.isTag1(tag)))) {
// ok, start collecting
filterTag = tag;
this.filterTag = tag;
final CharBuffer scb = new CharBuffer(content);
filterOpts = scb.propParser();
this.filterOpts = scb.propParser();
try {
scb.close();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
}
if (filterCont == null) filterCont = new CharBuffer(Math.max(100, content.length)); else filterCont.reset();
if (this.filterCont == null) this.filterCont = new CharBuffer(Math.max(100, content.length)); else this.filterCont.reset();
return new char[0];
} else {
// we ignore that thing and return it again
return genTag0raw(tag, true, content);
}
}
// we ignore that thing and return it again
return genTag0raw(tag, false, content);
}
// we are collecting tag text for the tag 'filterTag'
if (tag == null) {
// go on collecting content
if (scraper != null) scraper.scrapeText(content, filterTag);
if (transformer != null) {
filterCont.append(transformer.transformText(content));
} else {
filterCont.append(content);
}
if (this.scraper != null) this.scraper.scrapeText(content, this.filterTag);
try {
if (this.transformer != null) {
this.filterCont.append(this.transformer.transformText(content));
} else {
this.filterCont.append(content);
}
} catch (final OutOfMemoryError e) {}
return new char[0];
}
// it's a tag! which one?
if ((opening) || (!(tag.equalsIgnoreCase(filterTag)))) {
if ((opening) || (!(tag.equalsIgnoreCase(this.filterTag)))) {
// this tag is not our concern. just add it
filterCont.append(genTag0raw(tag, opening, content));
this.filterCont.append(genTag0raw(tag, opening, content));
return new char[0];
}
// it's our closing tag! return complete result.
char[] ret;
if (scraper != null) scraper.scrapeTag1(filterTag, filterOpts, filterCont.getChars());
if (transformer != null) {
ret = transformer.transformTag1(filterTag, filterOpts, filterCont.getChars(), quotechar);
if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
if (this.transformer != null) {
ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
} else {
ret = genTag1(filterTag, filterOpts, filterCont.getChars(), quotechar);
ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
}
filterTag = null;
filterOpts = null;
filterCont = null;
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
return ret;
}
private char[] filterFinalize(final char quotechar) {
if (filterTag == null) {
if (this.filterTag == null) {
return new char[0];
}
// it's our closing tag! return complete result.
char[] ret;
if (scraper != null) scraper.scrapeTag1(filterTag, filterOpts, filterCont.getChars());
if (transformer != null) {
ret = transformer.transformTag1(filterTag, filterOpts, filterCont.getChars(), quotechar);
if (this.scraper != null) this.scraper.scrapeTag1(this.filterTag, this.filterOpts, this.filterCont.getChars());
if (this.transformer != null) {
ret = this.transformer.transformTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
} else {
ret = genTag1(filterTag, filterOpts, filterCont.getChars(), quotechar);
ret = genTag1(this.filterTag, this.filterOpts, this.filterCont.getChars(), quotechar);
}
filterTag = null;
filterOpts = null;
filterCont = null;
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
return ret;
}
@ -316,27 +318,27 @@ public final class TransformerWriter extends Writer {
// System.out.println("FILTER0: " + UTF8.String(in)); // debug
// scan the string and parse structure
if (in.length > 2 && in[0] == lb) {
// a tag
String tag;
int tagend;
if (in[1] == '/') {
// a closing tag
tagend = tagEnd(in, 2);
tag = new String(in, 2, tagend - 2);
tag = new String(in, 2, tagend - 2);
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(tag, false, text, quotechar);
}
// an opening tag
tagend = tagEnd(in, 1);
tag = new String(in, 1, tagend - 1);
tag = new String(in, 1, tagend - 1);
final char[] text = new char[in.length - tagend - 1];
System.arraycopy(in, tagend, text, 0, in.length - tagend - 1);
return filterTag(tag, true, text, quotechar);
}
// a text
return filterTag(null, true, in, quotechar);
}
@ -357,160 +359,160 @@ public final class TransformerWriter extends Writer {
@Override
public void write(final int c) throws IOException {
// System.out.println((char) c);
if ((binaryUnsuspect) && (binaryHint((char)c))) {
binaryUnsuspect = false;
if (passbyIfBinarySuspect) close();
if ((this.binaryUnsuspect) && (binaryHint((char)c))) {
this.binaryUnsuspect = false;
if (this.passbyIfBinarySuspect) close();
}
if (binaryUnsuspect || !passbyIfBinarySuspect) {
if (this.binaryUnsuspect || !this.passbyIfBinarySuspect) {
char[] filtered;
if (inSingleQuote) {
buffer.append(c);
if (c == singlequote) inSingleQuote = false;
if (this.inSingleQuote) {
this.buffer.append(c);
if (c == singlequote) this.inSingleQuote = false;
// check error cases
if ((c == rb) && (buffer.length() > 0 && buffer.charAt(0) == lb)) {
inSingleQuote = false;
if ((c == rb) && (this.buffer.length() > 0 && this.buffer.charAt(0) == lb)) {
this.inSingleQuote = false;
// the tag ends here. after filtering: pass on
filtered = filterSentence(buffer.getChars(), singlequote);
if (out != null) { out.write(filtered); }
filtered = filterSentence(this.buffer.getChars(), singlequote);
if (this.out != null) { this.out.write(filtered); }
// buffer = new serverByteBuffer();
buffer.reset();
this.buffer.reset();
}
} else if (inDoubleQuote) {
buffer.append(c);
if (c == doublequote) inDoubleQuote = false;
} else if (this.inDoubleQuote) {
this.buffer.append(c);
if (c == doublequote) this.inDoubleQuote = false;
// check error cases
if (c == rb && buffer.length() > 0 && buffer.charAt(0) == lb) {
inDoubleQuote = false;
if (c == rb && this.buffer.length() > 0 && this.buffer.charAt(0) == lb) {
this.inDoubleQuote = false;
// the tag ends here. after filtering: pass on
filtered = filterSentence(buffer.getChars(), doublequote);
if (out != null) out.write(filtered);
filtered = filterSentence(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
// buffer = new serverByteBuffer();
buffer.reset();
this.buffer.reset();
}
} else if (inComment) {
buffer.append(c);
} else if (this.inComment) {
this.buffer.append(c);
if (c == rb &&
buffer.length() > 6 &&
buffer.charAt(buffer.length() - 3) == dash) {
this.buffer.length() > 6 &&
this.buffer.charAt(this.buffer.length() - 3) == dash) {
// comment is at end
inComment = false;
char[] comment = buffer.getChars();
if (scraper != null) scraper.scrapeComment(comment);
if (out != null) out.write(comment);
this.inComment = false;
final char[] comment = this.buffer.getChars();
if (this.scraper != null) this.scraper.scrapeComment(comment);
if (this.out != null) this.out.write(comment);
// buffer = new serverByteBuffer();
buffer.reset();
this.buffer.reset();
}
} else if (inScript) {
buffer.append(c);
final int bufferLength = buffer.length();
} else if (this.inScript) {
this.buffer.append(c);
final int bufferLength = this.buffer.length();
if ((c == rb) && (bufferLength > 14) &&
(buffer.charAt(bufferLength - 9) == lb) &&
(buffer.charAt(bufferLength - 8) == '/') &&
(buffer.charAt(bufferLength - 7) == 's') &&
(buffer.charAt(bufferLength - 6) == 'c') &&
(buffer.charAt(bufferLength - 5) == 'r') &&
(buffer.charAt(bufferLength - 4) == 'i') &&
(buffer.charAt(bufferLength - 3) == 'p') &&
(buffer.charAt(bufferLength - 2) == 't')) {
(this.buffer.charAt(bufferLength - 9) == lb) &&
(this.buffer.charAt(bufferLength - 8) == '/') &&
(this.buffer.charAt(bufferLength - 7) == 's') &&
(this.buffer.charAt(bufferLength - 6) == 'c') &&
(this.buffer.charAt(bufferLength - 5) == 'r') &&
(this.buffer.charAt(bufferLength - 4) == 'i') &&
(this.buffer.charAt(bufferLength - 3) == 'p') &&
(this.buffer.charAt(bufferLength - 2) == 't')) {
// script is at end
inScript = false;
if (out != null) out.write(buffer.getChars());
this.inScript = false;
if (this.out != null) this.out.write(this.buffer.getChars());
// buffer = new serverByteBuffer();
buffer.reset();
this.buffer.reset();
}
} else if (inStyle) {
buffer.append(c);
final int bufferLength = buffer.length();
} else if (this.inStyle) {
this.buffer.append(c);
final int bufferLength = this.buffer.length();
if ((c == rb) && (bufferLength > 13) &&
(buffer.charAt(bufferLength - 8) == lb) &&
(buffer.charAt(bufferLength - 7) == '/') &&
(buffer.charAt(bufferLength - 6) == 's') &&
(buffer.charAt(bufferLength - 5) == 't') &&
(buffer.charAt(bufferLength - 4) == 'y') &&
(buffer.charAt(bufferLength - 3) == 'l') &&
(buffer.charAt(bufferLength - 2) == 'e')) {
(this.buffer.charAt(bufferLength - 8) == lb) &&
(this.buffer.charAt(bufferLength - 7) == '/') &&
(this.buffer.charAt(bufferLength - 6) == 's') &&
(this.buffer.charAt(bufferLength - 5) == 't') &&
(this.buffer.charAt(bufferLength - 4) == 'y') &&
(this.buffer.charAt(bufferLength - 3) == 'l') &&
(this.buffer.charAt(bufferLength - 2) == 'e')) {
// style is at end
inStyle = false;
if (out != null) out.write(buffer.getChars());
this.inStyle = false;
if (this.out != null) this.out.write(this.buffer.getChars());
// buffer = new serverByteBuffer();
buffer.reset();
this.buffer.reset();
}
} else {
if (buffer.length() == 0) {
if (this.buffer.length() == 0) {
if (c == rb) {
// very strange error case; we just let it pass
if (out != null) out.write(c);
if (this.out != null) this.out.write(c);
} else {
buffer.append(c);
this.buffer.append(c);
}
} else if (buffer.length() > 0 && buffer.charAt(0) == lb) {
if (c == singlequote) inSingleQuote = true;
if (c == doublequote) inDoubleQuote = true;
} else if (this.buffer.length() > 0 && this.buffer.charAt(0) == lb) {
if (c == singlequote) this.inSingleQuote = true;
if (c == doublequote) this.inDoubleQuote = true;
// fill in tag text
if ((buffer.length() >= 3) && (buffer.charAt(1) == excl) &&
(buffer.charAt(2) == dash) && (c == dash)) {
if ((this.buffer.length() >= 3) && (this.buffer.charAt(1) == excl) &&
(this.buffer.charAt(2) == dash) && (c == dash)) {
// this is the start of a comment
inComment = true;
buffer.append(c);
} else if ((buffer.length() >= 6) &&
(buffer.charAt(1) == 's') &&
(buffer.charAt(2) == 'c') &&
(buffer.charAt(3) == 'r') &&
(buffer.charAt(4) == 'i') &&
(buffer.charAt(5) == 'p') &&
this.inComment = true;
this.buffer.append(c);
} else if ((this.buffer.length() >= 6) &&
(this.buffer.charAt(1) == 's') &&
(this.buffer.charAt(2) == 'c') &&
(this.buffer.charAt(3) == 'r') &&
(this.buffer.charAt(4) == 'i') &&
(this.buffer.charAt(5) == 'p') &&
(c == 't')) {
// this is the start of a javascript
inScript = true;
buffer.append(c);
} else if ((buffer.length() >= 5) &&
(buffer.charAt(1) == 's') &&
(buffer.charAt(2) == 't') &&
(buffer.charAt(3) == 'y') &&
(buffer.charAt(4) == 'l') &&
this.inScript = true;
this.buffer.append(c);
} else if ((this.buffer.length() >= 5) &&
(this.buffer.charAt(1) == 's') &&
(this.buffer.charAt(2) == 't') &&
(this.buffer.charAt(3) == 'y') &&
(this.buffer.charAt(4) == 'l') &&
(c == 'e')) {
// this is the start of a css-style
inStyle = true;
buffer.append(c);
this.inStyle = true;
this.buffer.append(c);
} else if (c == rb) {
buffer.append(c);
this.buffer.append(c);
// the tag ends here. after filtering: pass on
filtered = filterSentence(buffer.getChars(), doublequote);
if (out != null) out.write(filtered);
filtered = filterSentence(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
// buffer = new serverByteBuffer();
buffer.reset();
this.buffer.reset();
} else if (c == lb) {
// this is an error case
// we consider that there is one rb missing
if (buffer.length() > 0) {
filtered = filterSentence(buffer.getChars(), doublequote);
if (out != null) out.write(filtered);
if (this.buffer.length() > 0) {
filtered = filterSentence(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
}
// buffer = new serverByteBuffer();
buffer.reset();
buffer.append(c);
this.buffer.reset();
this.buffer.append(c);
} else {
buffer.append(c);
this.buffer.append(c);
}
} else {
// fill in plain text
if (c == lb) {
// the text ends here
if (buffer.length() > 0) {
filtered = filterSentence(buffer.getChars(), doublequote);
if (out != null) out.write(filtered);
if (this.buffer.length() > 0) {
filtered = filterSentence(this.buffer.getChars(), doublequote);
if (this.out != null) this.out.write(filtered);
}
// buffer = new serverByteBuffer();
buffer.reset();
buffer.append(c);
this.buffer.reset();
this.buffer.append(c);
} else {
// simply append
buffer.append(c);
this.buffer.append(c);
}
}
}
} else {
out.write(c);
this.out.write(c);
}
}
@ -529,28 +531,28 @@ public final class TransformerWriter extends Writer {
// we must not flush the current string buffer, because that would mess
// up the filter process; instead, we simply flush the underlying output stream
if (out != null) out.flush();
if (this.out != null) this.out.flush();
// if you want to flush all, call close() at end of writing;
}
public void close() throws IOException {
final char quotechar = (inSingleQuote) ? singlequote : doublequote;
if (buffer != null) {
if (buffer.length() > 0) {
final char[] filtered = filterSentence(buffer.getChars(), quotechar);
if (out != null) out.write(filtered);
final char quotechar = (this.inSingleQuote) ? singlequote : doublequote;
if (this.buffer != null) {
if (this.buffer.length() > 0) {
final char[] filtered = filterSentence(this.buffer.getChars(), quotechar);
if (this.out != null) this.out.write(filtered);
}
buffer = null;
this.buffer = null;
}
final char[] finalized = filterFinalize(quotechar);
if (out != null) {
if (finalized != null) out.write(finalized);
out.flush();
out.close();
if (this.out != null) {
if (finalized != null) this.out.write(finalized);
this.out.flush();
this.out.close();
}
filterTag = null;
filterOpts = null;
filterCont = null;
this.filterTag = null;
this.filterOpts = null;
this.filterCont = null;
// if (scraper != null) {scraper.close(); scraper = null;}
// if (transformer != null) {transformer.close(); transformer = null;}
}
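// Typical driving pattern (a sketch under the same conventions as
// ContentScraper.recursiveParse(); the URL is hypothetical):
//   final ContentScraper scraper = new ContentScraper(new MultiProtocolURI("http://localhost"));
//   final TransformerWriter writer = new TransformerWriter(null, null, scraper, null, false);
//   FileUtils.copy(new CharArrayReader("<html><title>t</title></html>".toCharArray()), writer);
//   writer.close(); // flushes the pending buffer and finalizes any open tag collection
//   scraper.getTitle(); // -> "t"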
@ -573,11 +575,11 @@ public final class TransformerWriter extends Writer {
}
public boolean binarySuspect() {
return !binaryUnsuspect;
return !this.binaryUnsuspect;
}
public static void main(final String[] args) {
// takes one argument: a file name
// takes one argument: a file name
if (args.length != 1) return;
// TODO: this does not work at the moment
System.out.println("this does not work at the moment");
@ -585,7 +587,7 @@ public final class TransformerWriter extends Writer {
final char[] buffer = new char[512];
try {
final ContentScraper scraper = new ContentScraper(new DigestURI("http://localhost:8090"));
final Transformer transformer = new ContentTransformer();
final Transformer transformer = new ContentTransformer();
final Reader is = new FileReader(args[0]);
final FileOutputStream fos = new FileOutputStream(new File(args[0] + ".out"));
final Writer os = new TransformerWriter(fos, UTF8.charset, scraper, transformer, false);

@ -1,4 +1,4 @@
//pdfParser.java
//pdfParser.java
//------------------------
//part of YaCy
//(C) by Michael Peter Christen; mc@yacy.net
@ -32,15 +32,6 @@ import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.PDFTextStripper;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
@ -50,23 +41,38 @@ import net.yacy.document.Parser;
import net.yacy.kelondro.io.CharBuffer;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
import org.apache.pdfbox.cos.COSName;
import org.apache.pdfbox.exceptions.CryptographyException;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentInformation;
import org.apache.pdfbox.pdmodel.encryption.AccessPermission;
import org.apache.pdfbox.pdmodel.encryption.BadSecurityHandlerException;
import org.apache.pdfbox.pdmodel.encryption.StandardDecryptionMaterial;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.PDFTextStripper;
public class pdfParser extends AbstractParser implements Parser {
public pdfParser() {
public pdfParser() {
super("Acrobat Portable Document Parser");
SUPPORTED_EXTENSIONS.add("pdf");
SUPPORTED_MIME_TYPES.add("application/pdf");
SUPPORTED_MIME_TYPES.add("application/x-pdf");
SUPPORTED_MIME_TYPES.add("application/acrobat");
SUPPORTED_MIME_TYPES.add("applications/vnd.pdf");
SUPPORTED_MIME_TYPES.add("text/pdf");
SUPPORTED_MIME_TYPES.add("text/x-pdf");
this.SUPPORTED_EXTENSIONS.add("pdf");
this.SUPPORTED_MIME_TYPES.add("application/pdf");
this.SUPPORTED_MIME_TYPES.add("application/x-pdf");
this.SUPPORTED_MIME_TYPES.add("application/acrobat");
this.SUPPORTED_MIME_TYPES.add("applications/vnd.pdf");
this.SUPPORTED_MIME_TYPES.add("text/pdf");
this.SUPPORTED_MIME_TYPES.add("text/x-pdf");
}
public Document[] parse(final MultiProtocolURI location, final String mimeType, final String charset, final InputStream source) throws Parser.Failure, InterruptedException {
// check memory for parser
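// (a pre-flight check: PDF parsing can claim a lot of heap, so fail fast with
// a Parser.Failure instead of risking an OutOfMemoryError halfway through)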
if (!MemoryControl.request(200 * 1024 * 1024, true))
throw new Parser.Failure("Not enough Memory available for pdf parser: " + MemoryControl.available(), location);
// create a pdf parser
PDDocument pdfDoc = null;
//final PDFParser pdfParser;
@ -76,32 +82,32 @@ public class pdfParser extends AbstractParser implements Parser {
//pdfParser = new PDFParser(source);
//pdfParser.parse();
//pdfDoc = pdfParser.getPDDocument();
} catch (IOException e) {
} catch (final IOException e) {
throw new Parser.Failure(e.getMessage(), location);
} finally {
Thread.currentThread().setPriority(Thread.NORM_PRIORITY);
}
if (pdfDoc.isEncrypted()) {
try {
pdfDoc.openProtection(new StandardDecryptionMaterial(""));
} catch (BadSecurityHandlerException e) {
try {pdfDoc.close();} catch (IOException ee) {}
} catch (final BadSecurityHandlerException e) {
try {pdfDoc.close();} catch (final IOException ee) {}
throw new Parser.Failure("Document is encrypted (1): " + e.getMessage(), location);
} catch (IOException e) {
try {pdfDoc.close();} catch (IOException ee) {}
} catch (final IOException e) {
try {pdfDoc.close();} catch (final IOException ee) {}
throw new Parser.Failure("Document is encrypted (2): " + e.getMessage(), location);
} catch (CryptographyException e) {
try {pdfDoc.close();} catch (IOException ee) {}
} catch (final CryptographyException e) {
try {pdfDoc.close();} catch (final IOException ee) {}
throw new Parser.Failure("Document is encrypted (3): " + e.getMessage(), location);
}
final AccessPermission perm = pdfDoc.getCurrentAccessPermission();
if (perm == null || !perm.canExtractContent())
throw new Parser.Failure("Document is encrypted and cannot decrypted", location);
throw new Parser.Failure("Document is encrypted and cannot be decrypted", location);
}
// extracting some metadata
final PDDocumentInformation info = pdfDoc.getDocumentInformation();
final PDDocumentInformation info = pdfDoc.getDocumentInformation();
String docTitle = null, docSubject = null, docAuthor = null, docPublisher = null, docKeywordStr = null;
if (info != null) {
docTitle = info.getTitle();
@ -115,7 +121,7 @@ public class pdfParser extends AbstractParser implements Parser {
// info.getCreationDate());
// info.getModificationDate();
}
if (docTitle == null || docTitle.length() == 0) {
docTitle = MultiProtocolURI.unescape(location.getFileName());
}
@ -128,13 +134,13 @@ public class pdfParser extends AbstractParser implements Parser {
stripper.writeText(pdfDoc, writer); // may throw a NPE
pdfDoc.close();
writer.close();
} catch (IOException e) {
} catch (final IOException e) {
// close the writer
if (writer != null) try { writer.close(); } catch (final Exception ex) {}
try {pdfDoc.close();} catch (IOException ee) {}
try {pdfDoc.close();} catch (final IOException ee) {}
throw new Parser.Failure(e.getMessage(), location);
} finally {
try {pdfDoc.close();} catch (IOException e) {}
try {pdfDoc.close();} catch (final IOException e) {}
}
pdfDoc = null;
@ -145,7 +151,7 @@ public class pdfParser extends AbstractParser implements Parser {
if (docTitle == null) {
docTitle = docSubject;
}
byte[] contentBytes;
contentBytes = UTF8.getBytes(writer.toString());
@ -173,7 +179,7 @@ public class pdfParser extends AbstractParser implements Parser {
docPublisher,
null,
null,
0.0f, 0.0f,
0.0f, 0.0f,
contentBytes,
null,
null,
@ -190,10 +196,10 @@ public class pdfParser extends AbstractParser implements Parser {
// file
final File pdfFile = new File(args[0]);
if(pdfFile.canRead()) {
System.out.println(pdfFile.getAbsolutePath());
final long startTime = System.currentTimeMillis();
// parse
final AbstractParser parser = new pdfParser();
Document document = null;
@ -207,13 +213,13 @@ public class pdfParser extends AbstractParser implements Parser {
Log.logException(e);
} catch (final NoClassDefFoundError e) {
System.err.println("class not found: " + e.getMessage());
} catch (FileNotFoundException e) {
} catch (final FileNotFoundException e) {
Log.logException(e);
}
// statistics
System.out.println("\ttime elapsed: " + (System.currentTimeMillis() - startTime) + " ms");
// output
if (document == null) {
System.out.println("\t!!!Parsing without result!!!");

@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -38,35 +38,40 @@ import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.TextParser;
import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.MemoryControl;
// this is a new implementation of this parser idiom using multiple documents as a result set
public class zipParser extends AbstractParser implements Parser {
public zipParser() {
public zipParser() {
super("ZIP File Parser");
SUPPORTED_EXTENSIONS.add("zip");
SUPPORTED_EXTENSIONS.add("jar");
SUPPORTED_EXTENSIONS.add("apk"); // Android package
SUPPORTED_MIME_TYPES.add("application/zip");
SUPPORTED_MIME_TYPES.add("application/x-zip");
SUPPORTED_MIME_TYPES.add("application/x-zip-compressed");
SUPPORTED_MIME_TYPES.add("application/x-compress");
SUPPORTED_MIME_TYPES.add("application/x-compressed");
SUPPORTED_MIME_TYPES.add("multipart/x-zip");
SUPPORTED_MIME_TYPES.add("application/java-archive");
SUPPORTED_MIME_TYPES.add("application/vnd.android.package-archive");
this.SUPPORTED_EXTENSIONS.add("zip");
this.SUPPORTED_EXTENSIONS.add("jar");
this.SUPPORTED_EXTENSIONS.add("apk"); // Android package
this.SUPPORTED_MIME_TYPES.add("application/zip");
this.SUPPORTED_MIME_TYPES.add("application/x-zip");
this.SUPPORTED_MIME_TYPES.add("application/x-zip-compressed");
this.SUPPORTED_MIME_TYPES.add("application/x-compress");
this.SUPPORTED_MIME_TYPES.add("application/x-compressed");
this.SUPPORTED_MIME_TYPES.add("multipart/x-zip");
this.SUPPORTED_MIME_TYPES.add("application/java-archive");
this.SUPPORTED_MIME_TYPES.add("application/vnd.android.package-archive");
}
public Document[] parse(final MultiProtocolURI url, final String mimeType,
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
Document[] docs = null;
// check memory for parser
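// (each entry is copied to a temp file and parsed recursively, so large
// archives can claim a lot of heap; fail fast instead of running out later)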
if (!MemoryControl.request(200 * 1024 * 1024, true))
throw new Parser.Failure("Not enough Memory available for zip parser: " + MemoryControl.available(), url);
Document[] docs = null;
final List<Document> docacc = new ArrayList<Document>();
ZipEntry entry;
final ZipInputStream zis = new ZipInputStream(source);
final ZipInputStream zis = new ZipInputStream(source);
File tmp = null;
// loop through the elements in the zip file and parse every single file inside
while (true) {
try {
@ -74,22 +79,24 @@ public class zipParser extends AbstractParser implements Parser {
entry = zis.getNextEntry();
if (entry == null) break;
if (entry.isDirectory() || entry.getSize() <= 0) continue;
final String name = entry.getName();
final String name = entry.getName();
final int idx = name.lastIndexOf('.');
final String mime = TextParser.mimeOf((idx >= 0) ? name.substring(idx + 1) : "");
try {
tmp = FileUtils.createTempFile(this.getClass(), name);
FileUtils.copy(zis, tmp, entry.getSize());
docs = TextParser.parseSource(MultiProtocolURI.newURL(url, "#" + name), mime, null, tmp);
FileUtils.copy(zis, tmp, entry.getSize());
final MultiProtocolURI virtualURL = MultiProtocolURI.newURL(url, "#" + name);
//this.log.logInfo("ZIP file parser: " + virtualURL.toNormalform(false, false));
docs = TextParser.parseSource(virtualURL, mime, null, tmp);
if (docs == null) continue;
for (final Document d: docs) docacc.add(d);
} catch (final Parser.Failure e) {
log.logWarning("ZIP parser entry " + name + ": " + e.getMessage());
this.log.logWarning("ZIP parser entry " + name + ": " + e.getMessage());
} finally {
if (tmp != null) FileUtils.deletedelete(tmp);
}
} catch (IOException e) {
log.logWarning("ZIP parser:" + e.getMessage());
} catch (final IOException e) {
this.log.logWarning("ZIP parser:" + e.getMessage());
break;
}
}

@ -597,7 +597,7 @@ public class ArrayStack implements BLOB {
* @return
* @throws IOException
*/
public synchronized byte[] get(byte[] key) throws IOException, RowSpaceExceededException {
public byte[] get(byte[] key) throws IOException, RowSpaceExceededException {
if (blobs.size() == 0) return null;
if (blobs.size() == 1) {
blobItem bi = blobs.get(0);

@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -34,12 +34,9 @@ import java.io.InputStream;
import java.io.OutputStream;
import java.util.Map;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;
import net.yacy.cora.document.ASCII;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.order.ByteOrder;
@ -52,89 +49,35 @@ public class Compressor implements BLOB {
static byte[] gzipMagic = {(byte) 'z', (byte) '|'}; // magic for gzip-encoded content
static byte[] plainMagic = {(byte) 'p', (byte) '|'}; // magic for plain content (no encoding)
private final BLOB backend;
private TreeMap<byte[], byte[]> buffer; // entries which are not yet compressed, format is RAW (without magic)
private BlockingQueue<Entity> writeQueue;
private long bufferlength;
private final long maxbufferlength;
private final Worker[] worker;
public Compressor(BLOB backend, long buffersize) {
public Compressor(final BLOB backend, final long buffersize) {
this.backend = backend;
this.maxbufferlength = buffersize;
this.writeQueue = new LinkedBlockingQueue<Entity>();
this.worker = new Worker[Math.min(4, Runtime.getRuntime().availableProcessors())];
for (int i = 0; i < this.worker.length; i++) {
this.worker[i] = new Worker();
this.worker[i].start();
}
initBuffer();
}
public long mem() {
return backend.mem();
return this.backend.mem();
}
public void trim() {
this.backend.trim();
}
private static class Entity implements Map.Entry<byte[], byte[]> {
private byte[] key;
private byte[] payload;
public Entity(byte[] key, byte[] payload) {
this.key = key;
this.payload = payload;
}
public byte[] getKey() {
return this.key;
}
public byte[] getValue() {
return this.payload;
}
public byte[] setValue(byte[] payload) {
byte[] payload0 = payload;
this.payload = payload;
return payload0;
}
}
private final static Entity poisonWorkerEntry = new Entity(ASCII.getBytes("poison"), null);
private class Worker extends Thread {
public Worker() {
}
@Override
public void run() {
Entity entry;
try {
while ((entry = writeQueue.take()) != poisonWorkerEntry) {
try {
Compressor.this.backend.insert(entry.getKey(), compress(entry.getValue()));
} catch (IOException e) {
Log.logException(e);
buffer.put(entry.getKey(), entry.getValue());
}
}
} catch (InterruptedException e) {
Log.logException(e);
}
}
}
public String name() {
return this.backend.name();
}
public synchronized void clear() throws IOException {
initBuffer();
this.writeQueue.clear();
this.backend.clear();
}
private void initBuffer() {
this.buffer = new TreeMap<byte[], byte[]>(this.backend.ordering());
this.bufferlength = 0;
@ -143,32 +86,22 @@ public class Compressor implements BLOB {
public ByteOrder ordering() {
return this.backend.ordering();
}
public synchronized void close(boolean writeIDX) {
public synchronized void close(final boolean writeIDX) {
// no more thread is running, flush all queues
flushAll();
for (int i = 0; i < this.worker.length; i++) try {
this.writeQueue.put(poisonWorkerEntry);
} catch (InterruptedException e) {
Log.logException(e);
}
for (int i = 0; i < this.worker.length; i++) try {
this.worker[i].join();
} catch (InterruptedException e) {
Log.logException(e);
}
this.backend.close(writeIDX);
}
private static byte[] compress(byte[] b) {
int l = b.length;
private static byte[] compress(final byte[] b) {
final int l = b.length;
if (l < 100) return markWithPlainMagic(b);
byte[] bb = compressAddMagic(b);
final byte[] bb = compressAddMagic(b);
if (bb.length >= l) return markWithPlainMagic(b);
return bb;
}
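// Storage format sketch: every stored value carries a two-byte magic prefix,
//   "z|" + gzip(payload)  when gzip actually saves space,
//   "p|" + payload        for short (< 100 bytes) or incompressible entries;
// decompress() below dispatches on that prefix to restore the raw payload.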
private static byte[] compressAddMagic(byte[] b) {
private static byte[] compressAddMagic(final byte[] b) {
// compress a byte array and add a leading magic for the compression
try {
//System.out.print("/(" + cdr + ")"); // DEBUG
@ -179,27 +112,27 @@ public class Compressor implements BLOB {
os.close();
baos.close();
return baos.toByteArray();
} catch (IOException e) {
} catch (final IOException e) {
Log.logSevere("Compressor", "", e);
return null;
}
}
private static byte[] markWithPlainMagic(byte[] b) {
private static byte[] markWithPlainMagic(final byte[] b) {
//System.out.print("+"); // DEBUG
byte[] r = new byte[b.length + 2];
final byte[] r = new byte[b.length + 2];
r[0] = plainMagic[0];
r[1] = plainMagic[1];
System.arraycopy(b, 0, r, 2, b.length);
return r;
}
private static byte[] decompress(byte[] b) {
private static byte[] decompress(final byte[] b) {
// use a magic in the head of the bytes to identify compression type
if (b == null) return null;
if (ByteArray.startsWith(b, gzipMagic)) {
//System.out.print("\\"); // DEBUG
ByteArrayInputStream bais = new ByteArrayInputStream(b);
final ByteArrayInputStream bais = new ByteArrayInputStream(b);
// eat up the magic
bais.read();
bais.read();
@ -208,21 +141,21 @@ public class Compressor implements BLOB {
try {
gis = new GZIPInputStream(bais);
final ByteArrayOutputStream baos = new ByteArrayOutputStream(b.length);
final byte[] buf = new byte[1024];
final byte[] buf = new byte[1024 * 4];
int n;
while ((n = gis.read(buf)) > 0) baos.write(buf, 0, n);
gis.close();
bais.close();
baos.close();
return baos.toByteArray();
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
return null;
}
} else if (ByteArray.startsWith(b, plainMagic)) {
//System.out.print("-"); // DEBUG
byte[] r = new byte[b.length - 2];
final byte[] r = new byte[b.length - 2];
System.arraycopy(b, 2, r, 0, b.length - 2);
return r;
} else {
@ -231,44 +164,41 @@ public class Compressor implements BLOB {
}
}
public byte[] get(byte[] key) throws IOException, RowSpaceExceededException {
public byte[] get(final byte[] key) throws IOException, RowSpaceExceededException {
// depending on the source of the result, we additionally do entry compression
// because if a document was read once, we assume that it will not be retrieved again soon
byte[] b = null;
synchronized (this) {
b = buffer.remove(key);
b = this.buffer.remove(key);
if (b != null) {
// compress the entry now (asynchronously, via the write queue) and put it to the backend
try {
this.writeQueue.put(new Entity(key, b));
this.bufferlength = this.bufferlength - b.length;
return b;
} catch (InterruptedException e) {
Log.logException(e);
buffer.put(key, b);
}
this.backend.insert(key, compress(b));
this.bufferlength = this.bufferlength - b.length;
return b;
}
// return from the backend
b = this.backend.get(key);
}
// return from the backend
b = this.backend.get(key);
if (b == null) return null;
if (!MemoryControl.request(b.length * 2, true)) {
throw new RowSpaceExceededException(b.length * 2, "decompress needs 2 * " + b.length + " bytes");
}
return decompress(b);
}
public byte[] get(Object key) {
public byte[] get(final Object key) {
if (!(key instanceof byte[])) return null;
try {
return get((byte[]) key);
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
Log.logException(e);
}
return null;
}
public boolean containsKey(byte[] key) {
public boolean containsKey(final byte[] key) {
synchronized (this) {
return this.buffer.containsKey(key) || this.backend.containsKey(key);
}
@ -281,13 +211,13 @@ public class Compressor implements BLOB {
public synchronized long length() {
try {
return this.backend.length() + this.bufferlength;
} catch (IOException e) {
} catch (final IOException e) {
Log.logException(e);
return 0;
}
}
public long length(byte[] key) throws IOException {
public long length(final byte[] key) throws IOException {
synchronized (this) {
byte[] b = this.buffer.get(key);
if (b != null) return b.length;
@ -296,23 +226,23 @@ public class Compressor implements BLOB {
if (b == null) return 0;
b = decompress(b);
return (b == null) ? 0 : b.length;
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
throw new IOException(e.getMessage());
}
}
}
private int removeFromQueues(byte[] key) {
byte[] b = this.buffer.remove(key);
private int removeFromQueues(final byte[] key) {
final byte[] b = this.buffer.remove(key);
if (b != null) return b.length;
return 0;
}
public void insert(byte[] key, byte[] b) throws IOException {
public void insert(final byte[] key, final byte[] b) throws IOException {
// first ensure that the files do not exist anywhere
delete(key);
// check if the buffer is full or could be full after this write
if (this.bufferlength + b.length * 2 > this.maxbufferlength) synchronized (this) {
// if we must compress, compress only as much as necessary to make enough room
@ -331,46 +261,46 @@ public class Compressor implements BLOB {
this.buffer.put(key, b);
this.bufferlength += b.length;
}
if (MemoryControl.shortStatus()) flushAll();
}
public synchronized void delete(byte[] key) throws IOException {
public synchronized void delete(final byte[] key) throws IOException {
this.backend.delete(key);
long rx = removeFromQueues(key);
final long rx = removeFromQueues(key);
if (rx > 0) this.bufferlength -= rx;
}
public synchronized int size() {
return this.backend.size() + this.buffer.size();
}
public synchronized boolean isEmpty() {
if (!this.backend.isEmpty()) return false;
if (!this.buffer.isEmpty()) return false;
return true;
}
public synchronized CloneableIterator<byte[]> keys(boolean up, boolean rotating) throws IOException {
public synchronized CloneableIterator<byte[]> keys(final boolean up, final boolean rotating) throws IOException {
flushAll();
return this.backend.keys(up, rotating);
}
public synchronized CloneableIterator<byte[]> keys(boolean up, byte[] firstKey) throws IOException {
public synchronized CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) throws IOException {
flushAll();
return this.backend.keys(up, firstKey);
}
private boolean flushOne() {
if (this.buffer.isEmpty()) return false;
// take one entry from the buffer and hand it to the write queue for compression
Map.Entry<byte[], byte[]> entry = this.buffer.entrySet().iterator().next();
final Map.Entry<byte[], byte[]> entry = this.buffer.entrySet().iterator().next();
this.buffer.remove(entry.getKey());
try {
this.writeQueue.put(new Entity(entry.getKey(), entry.getValue()));
this.backend.insert(entry.getKey(), compress(entry.getValue()));
this.bufferlength -= entry.getValue().length;
return true;
} catch (InterruptedException e) {
} catch (final IOException e) {
this.buffer.put(entry.getKey(), entry.getValue());
return false;
}
@ -382,25 +312,25 @@ public class Compressor implements BLOB {
}
}
public int replace(byte[] key, Rewriter rewriter) throws IOException, RowSpaceExceededException {
byte[] b = get(key);
public int replace(final byte[] key, final Rewriter rewriter) throws IOException, RowSpaceExceededException {
final byte[] b = get(key);
if (b == null) return 0;
byte[] c = rewriter.rewrite(b);
int reduction = c.length - b.length;
final byte[] c = rewriter.rewrite(b);
final int reduction = c.length - b.length;
assert reduction >= 0;
if (reduction == 0) return 0;
this.insert(key, c);
insert(key, c);
return reduction;
}
public int reduce(byte[] key, Reducer reducer) throws IOException, RowSpaceExceededException {
byte[] b = get(key);
public int reduce(final byte[] key, final Reducer reducer) throws IOException, RowSpaceExceededException {
final byte[] b = get(key);
if (b == null) return 0;
byte[] c = reducer.rewrite(b);
int reduction = c.length - b.length;
final byte[] c = reducer.rewrite(b);
final int reduction = c.length - b.length;
assert reduction >= 0;
if (reduction == 0) return 0;
this.insert(key, c);
insert(key, c);
return reduction;
}
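Taken together, compress(), markWithPlainMagic() and decompress() above define a small self-describing record format: a two-byte magic selects between gzip and plain payloads, and compression is kept only when it actually shrinks the data. A condensed sketch of that scheme (hypothetical class; the 100-byte threshold mirrors the code above, while the gzip marker bytes are an assumption, since only plainMagic is visible here):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.util.zip.GZIPInputStream;
import java.util.zip.GZIPOutputStream;

public class MagicCodec {
    private static final byte[] GZIP_MAGIC  = {(byte) 'z', (byte) '|'}; // assumed gzip marker
    private static final byte[] PLAIN_MAGIC = {(byte) 'p', (byte) '|'}; // plain marker, as above

    public static byte[] encode(final byte[] b) throws IOException {
        if (b.length >= 100) { // small payloads are not worth compressing
            final ByteArrayOutputStream baos = new ByteArrayOutputStream();
            baos.write(GZIP_MAGIC);
            final GZIPOutputStream gos = new GZIPOutputStream(baos);
            gos.write(b);
            gos.close();
            final byte[] compressed = baos.toByteArray();
            if (compressed.length < b.length) return compressed; // keep only if smaller
        }
        final byte[] r = new byte[b.length + 2]; // fall back to the plain marker
        r[0] = PLAIN_MAGIC[0];
        r[1] = PLAIN_MAGIC[1];
        System.arraycopy(b, 0, r, 2, b.length);
        return r;
    }

    public static byte[] decode(final byte[] b) throws IOException {
        if (b[0] == PLAIN_MAGIC[0] && b[1] == PLAIN_MAGIC[1]) {
            final byte[] r = new byte[b.length - 2];
            System.arraycopy(b, 2, r, 0, b.length - 2);
            return r;
        }
        // otherwise assume gzip: skip the magic, inflate the rest
        final GZIPInputStream gis = new GZIPInputStream(new ByteArrayInputStream(b, 2, b.length - 2));
        final ByteArrayOutputStream baos = new ByteArrayOutputStream();
        final byte[] buf = new byte[1024 * 4];
        int n;
        while ((n = gis.read(buf)) > 0) baos.write(buf, 0, n);
        gis.close();
        return baos.toByteArray();
    }
}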

@ -289,7 +289,13 @@ public class MapHeap implements Map<byte[], Map<String, String>> {
cache.clear();
}
Map<String, String> map;
// if we have the entry in the cache then just return that
Map<String, String> map = cache.get(key);
if (map != null) return map;
// in all other cases we must look into the cache again within
// a synchronized block, in case the entry was not in the cache but was
// stored there while another thread loaded it from the file system
if (storeCache) {
synchronized (this) {
map = cache.get(key);
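The MapHeap change above is a check-then-recheck cache read: look up the cache without locking first, then repeat the lookup inside the synchronized block so an entry stored by a concurrent thread is not loaded twice. A minimal sketch of the pattern (hypothetical loader and key type):

import java.util.Collections;
import java.util.HashMap;
import java.util.Map;

public class DoubleCheckedLookup {
    private final Map<String, String> cache =
            Collections.synchronizedMap(new HashMap<String, String>());

    public String get(final String key) {
        // fast path: no monitor contention if the entry is already cached
        String value = this.cache.get(key);
        if (value != null) return value;
        synchronized (this) {
            // re-check: another thread may have loaded and cached the entry
            // between our first lookup and this lock acquisition
            value = this.cache.get(key);
            if (value != null) return value;
            value = loadFromDisk(key); // slow path, done at most once per key
            this.cache.put(key, value);
            return value;
        }
    }

    private String loadFromDisk(final String key) {
        return "value-of-" + key; // stand-in for the expensive file-system read
    }
}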

@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -35,6 +35,7 @@ import java.util.concurrent.Semaphore;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.UTF8;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.order.Base64Order;
import net.yacy.kelondro.order.Bitfield;
@ -43,7 +44,6 @@ import net.yacy.kelondro.rwi.AbstractReference;
import net.yacy.kelondro.rwi.Reference;
import net.yacy.kelondro.rwi.ReferenceContainer;
import net.yacy.kelondro.util.ByteArray;
import net.yacy.kelondro.index.Row;
public class WordReferenceVars extends AbstractReference implements WordReference, Reference, Cloneable, Comparable<WordReferenceVars>, Comparator<WordReferenceVars> {
@ -54,7 +54,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
public static final WordReferenceVars poison = new WordReferenceVars();
private static int cores = Runtime.getRuntime().availableProcessors();
public static final byte[] default_language = UTF8.getBytes("uk");
public Bitfield flags;
public long lastModified;
public byte[] language;
@ -67,7 +67,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
wordsintext, wordsintitle;
private final ConcurrentLinkedQueue<Integer> positions;
public double termFrequency;
public WordReferenceVars(
final byte[] urlHash,
final int urlLength, // byte-length of complete URL
@ -76,7 +76,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
final int hitcount, // how often appears this word in the text
final int wordcount, // total number of words
final int phrasecount, // total number of phrases
final ConcurrentLinkedQueue<Integer> ps, // positions of words that are joined into the reference
final ConcurrentLinkedQueue<Integer> ps, // positions of words that are joined into the reference
final int posinphrase, // position of word in its phrase
final int posofphrase, // number of the phrase where word appears
final long lastmodified, // last-modified time of the document where word appears
@ -102,7 +102,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.lother = outlinksOther;
this.phrasesintext = phrasecount;
this.positions = new ConcurrentLinkedQueue<Integer>();
for (Integer i: ps) this.positions.add(i);
if (ps.size() > 0) for (final Integer i: ps) this.positions.add(i);
this.posinphrase = posinphrase;
this.posofphrase = posofphrase;
this.urlcomps = urlComps;
@ -112,7 +112,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.wordsintitle = titleLength;
this.termFrequency = termfrequency;
}
public WordReferenceVars(final WordReference e) {
this.flags = e.flags();
//this.freshUntil = e.freshUntil();
@ -125,7 +125,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.lother = e.lother();
this.phrasesintext = e.phrasesintext();
this.positions = new ConcurrentLinkedQueue<Integer>();
for (Integer i: e.positions()) this.positions.add(i);
if (e.positions().size() > 0) for (final Integer i: e.positions()) this.positions.add(i);
this.posinphrase = e.posinphrase();
this.posofphrase = e.posofphrase();
this.urlcomps = e.urlcomps();
@ -135,7 +135,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.wordsintitle = e.wordsintitle();
this.termFrequency = e.termFrequency();
}
/**
* initializer for special poison object
*/
@ -159,7 +159,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.wordsintitle = 0;
this.termFrequency = 0.0;
}
@Override
public WordReferenceVars clone() {
final WordReferenceVars c = new WordReferenceVars(
@ -183,7 +183,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
this.termFrequency);
return c;
}
public void join(final WordReferenceVars v) {
// combine the distance
this.positions.addAll(v.positions);
@ -196,71 +196,71 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
}
public Bitfield flags() {
return flags;
return this.flags;
}
public byte[] getLanguage() {
return language;
return this.language;
}
public char getType() {
return type;
return this.type;
}
public int hitcount() {
return hitcount;
return this.hitcount;
}
public long lastModified() {
return lastModified;
return this.lastModified;
}
public int llocal() {
return llocal;
return this.llocal;
}
public int lother() {
return lother;
return this.lother;
}
public int phrasesintext() {
return phrasesintext;
return this.phrasesintext;
}
public int posinphrase() {
return posinphrase;
return this.posinphrase;
}
public Collection<Integer> positions() {
return this.positions;
}
public int posofphrase() {
return posofphrase;
return this.posofphrase;
}
public WordReferenceRow toRowEntry() {
return new WordReferenceRow(
urlHash,
urllength, // byte-length of complete URL
urlcomps, // number of path components
wordsintitle, // length of description/length (longer are better?)
hitcount, // how often appears this word in the text
wordsintext, // total number of words
phrasesintext, // total number of phrases
positions.isEmpty() ? 1 : positions.iterator().next(), // position of word in all words
posinphrase, // position of word in its phrase
posofphrase, // number of the phrase where word appears
lastModified, // last-modified time of the document where word appears
this.urlHash,
this.urllength, // byte-length of complete URL
this.urlcomps, // number of path components
this.wordsintitle, // length of description/length (longer are better?)
this.hitcount, // how often appears this word in the text
this.wordsintext, // total number of words
this.phrasesintext, // total number of phrases
this.positions.isEmpty() ? 1 : this.positions.iterator().next(), // position of word in all words
this.posinphrase, // position of word in its phrase
this.posofphrase, // number of the phrase where word appears
this.lastModified, // last-modified time of the document where word appears
System.currentTimeMillis(), // update time;
language, // (guessed) language of document
type, // type of document
llocal, // outlinks to same domain
lother, // outlinks to other domain
flags // attributes to the url and to the word according to the url
this.language, // (guessed) language of document
this.type, // type of document
this.llocal, // outlinks to same domain
this.lother, // outlinks to other domain
this.flags // attributes to the url and to the word according to the url
);
}
public Entry toKelondroEntry() {
return toRowEntry().toKelondroEntry();
}
@ -270,40 +270,40 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
}
public byte[] urlhash() {
return urlHash;
return this.urlHash;
}
public String hosthash() {
if (hostHash != null) return hostHash;
hostHash = ASCII.String(urlHash, 6, 6);
return hostHash;
if (this.hostHash != null) return this.hostHash;
this.hostHash = ASCII.String(this.urlHash, 6, 6);
return this.hostHash;
}
public int urlcomps() {
return urlcomps;
return this.urlcomps;
}
public int urllength() {
return urllength;
return this.urllength;
}
public int virtualAge() {
return virtualAge;
return this.virtualAge;
}
public int wordsintext() {
return wordsintext;
return this.wordsintext;
}
public int wordsintitle() {
return wordsintitle;
return this.wordsintitle;
}
public double termFrequency() {
if (this.termFrequency == 0.0) this.termFrequency = (((double) this.hitcount()) / ((double) (this.wordsintext() + this.wordsintitle() + 1)));
if (this.termFrequency == 0.0) this.termFrequency = (((double) hitcount()) / ((double) (wordsintext() + wordsintitle() + 1)));
return this.termFrequency;
}
public final void min(final WordReferenceVars other) {
if (other == null) return;
int v;
@ -325,7 +325,7 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
if (this.wordsintitle > (v = other.wordsintitle)) this.wordsintitle = v;
if (this.termFrequency > (d = other.termFrequency)) this.termFrequency = d;
}
public final void max(final WordReferenceVars other) {
if (other == null) return;
int v;
@ -350,10 +350,10 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
public void join(final Reference r) {
// joins two entries into one entry
// combine the distance
WordReference oe = (WordReference) r;
for (Integer i: r.positions()) this.positions.add(i);
final WordReference oe = (WordReference) r;
for (final Integer i: r.positions()) this.positions.add(i);
this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0;
this.posofphrase = Math.min(this.posofphrase, oe.posofphrase());
@ -367,80 +367,80 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
if (this == obj) return true;
if (obj == null) return false;
if (!(obj instanceof WordReferenceVars)) return false;
WordReferenceVars other = (WordReferenceVars) obj;
final WordReferenceVars other = (WordReferenceVars) obj;
return Base64Order.enhancedCoder.equal(this.urlHash, other.urlHash);
}
@Override
public int hashCode() {
return ByteArray.hashCode(this.urlHash);
}
public int compareTo(final WordReferenceVars o) {
return Base64Order.enhancedCoder.compare(this.urlHash, o.urlhash());
}
public int compare(WordReferenceVars o1, WordReferenceVars o2) {
public int compare(final WordReferenceVars o1, final WordReferenceVars o2) {
return o1.compareTo(o2);
}
public void addPosition(final int position) {
this.positions.add(position);
}
/**
* transform a reference container into a stream of parsed entries
* @param container
* @return a blocking queue filled with WordReferenceVars that is still filled when the object is returned
*/
public static BlockingQueue<WordReferenceVars> transform(ReferenceContainer<WordReference> container) {
LinkedBlockingQueue<WordReferenceVars> vars = new LinkedBlockingQueue<WordReferenceVars>();
public static BlockingQueue<WordReferenceVars> transform(final ReferenceContainer<WordReference> container) {
final LinkedBlockingQueue<WordReferenceVars> vars = new LinkedBlockingQueue<WordReferenceVars>();
if (container.size() <= 100) {
// transform without concurrency to avoid thread creation overhead
for (Row.Entry entry: container) try {
for (final Row.Entry entry: container) try {
vars.put(new WordReferenceVars(new WordReferenceRow(entry)));
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
try {
vars.put(WordReferenceVars.poison);
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
return vars;
}
Thread distributor = new TransformDistributor(container, vars);
final Thread distributor = new TransformDistributor(container, vars);
distributor.start();
// return the resulting queue while the processing queues are still working
return vars;
}
public static class TransformDistributor extends Thread {
ReferenceContainer<WordReference> container;
BlockingQueue<WordReferenceVars> out;
public TransformDistributor(ReferenceContainer<WordReference> container, BlockingQueue<WordReferenceVars> out) {
public TransformDistributor(final ReferenceContainer<WordReference> container, final BlockingQueue<WordReferenceVars> out) {
this.container = container;
this.out = out;
}
@Override
public void run() {
// start the transformation threads
int cores0 = Math.min(cores, container.size() / 100) + 1;
Semaphore termination = new Semaphore(cores0);
TransformWorker[] worker = new TransformWorker[cores0];
final int cores0 = Math.min(cores, this.container.size() / 100) + 1;
final Semaphore termination = new Semaphore(cores0);
final TransformWorker[] worker = new TransformWorker[cores0];
for (int i = 0; i < cores0; i++) {
worker[i] = new TransformWorker(out, termination);
worker[i] = new TransformWorker(this.out, termination);
worker[i].start();
}
// fill the queue
int p = container.size();
int p = this.container.size();
while (p > 0) {
p--;
worker[p % cores0].add(container.get(p, false));
worker[p % cores0].add(this.container.get(p, false));
}
// insert poison to stop the queues
for (int i = 0; i < cores0; i++) worker[i].add(WordReferenceRow.poisonRowEntry);
}
@ -451,32 +451,32 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
BlockingQueue<Row.Entry> in;
BlockingQueue<WordReferenceVars> out;
Semaphore termination;
public TransformWorker(final BlockingQueue<WordReferenceVars> out, Semaphore termination) {
public TransformWorker(final BlockingQueue<WordReferenceVars> out, final Semaphore termination) {
this.in = new LinkedBlockingQueue<Row.Entry>();
this.out = out;
this.termination = termination;
}
public void add(Row.Entry entry) {
public void add(final Row.Entry entry) {
try {
in.put(entry);
} catch (InterruptedException e) {
this.in.put(entry);
} catch (final InterruptedException e) {
}
}
@Override
public void run() {
Row.Entry entry;
try {
while ((entry = in.take()) != WordReferenceRow.poisonRowEntry) out.put(new WordReferenceVars(new WordReferenceRow(entry)));
} catch (InterruptedException e) {}
while ((entry = this.in.take()) != WordReferenceRow.poisonRowEntry) this.out.put(new WordReferenceVars(new WordReferenceRow(entry)));
} catch (final InterruptedException e) {}
// insert poison to signal the termination to next queue
try {
this.termination.acquire();
if (this.termination.availablePermits() == 0) this.out.put(WordReferenceVars.poison);
} catch (InterruptedException e) {}
} catch (final InterruptedException e) {}
}
}
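transform() hands the container to a TransformDistributor, which fans the entries out to several TransformWorkers; the termination Semaphore makes the last worker to finish put the poison object into the shared output queue. The sketch below shows the same last-one-out signalling, but swaps the Semaphore for an AtomicInteger countdown, which states the intent more directly (all names hypothetical):

import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.atomic.AtomicInteger;

public class LastWorkerPoisons {
    public static void main(final String[] args) throws InterruptedException {
        final int workers = 3;
        final BlockingQueue<String> out = new LinkedBlockingQueue<String>();
        final String poisonOut = new String("done"); // identity-unique output sentinel
        // countdown of live workers: the thread that decrements it to zero
        // is the last one to finish and must poison the output queue
        final AtomicInteger alive = new AtomicInteger(workers);
        for (int w = 0; w < workers; w++) {
            final BlockingQueue<Integer> in = new LinkedBlockingQueue<Integer>();
            for (int j = 0; j <= w; j++) in.add(Integer.valueOf(j)); // this worker's share
            final Integer poisonIn = Integer.valueOf(-1);            // per-worker input sentinel
            in.add(poisonIn);
            new Thread() {
                @Override
                public void run() {
                    try {
                        Integer entry;
                        while (!(entry = in.take()).equals(poisonIn)) {
                            out.put("transformed " + entry);
                        }
                        if (alive.decrementAndGet() == 0) out.put(poisonOut);
                    } catch (final InterruptedException e) {}
                }
            }.start();
        }
        String result;
        while ((result = out.take()) != poisonOut) System.out.println(result);
    }
}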

@ -2,7 +2,7 @@
* RowSet
* Copyright 2006 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* First released 20.06.2006 at http://yacy.net
*
*
* $LastChangedDate$
* $LastChangedRevision$
* $LastChangedBy$
@ -11,12 +11,12 @@
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
@ -52,17 +52,17 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
super(rowdef, objectCount, cache, sortBound);
assert rowdef.objectOrder != null;
}
public RowSet(final Row rowdef, final int objectCount) throws RowSpaceExceededException {
super(rowdef, objectCount);
assert rowdef.objectOrder != null;
}
public RowSet(final Row rowdef) {
super(rowdef);
assert rowdef.objectOrder != null;
}
/**
* import an exported collection
* @param rowdef
@ -73,7 +73,7 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
super(rowdef, exportedCollectionRowEnvironment);
assert rowdef.objectOrder != null;
}
public final static RowSet importRowSet(final byte[] b, final Row rowdef) throws RowSpaceExceededException {
assert b.length >= exportOverheadSize : "b.length = " + b.length;
if (b.length < exportOverheadSize) return new RowSet(rowdef);
@ -83,14 +83,16 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
final int orderbound = (int) NaturalOrder.decodeLong(b, 10, 4);
assert orderbound >= 0 : "orderbound = " + orderbound;
if (orderbound < 0) return new RowSet(rowdef); // error
long alloc = ((long) size) * ((long) rowdef.objectsize);
final long alloc = ((long) size) * ((long) rowdef.objectsize);
assert alloc <= Integer.MAX_VALUE : "alloc = " + alloc;
if (alloc > Integer.MAX_VALUE) return null;
assert alloc == b.length - exportOverheadSize;
if (alloc != b.length - exportOverheadSize) return null;
MemoryControl.request((int) alloc, true);
final byte[] chunkcache;
try {
chunkcache = new byte[(int) alloc];
} catch (OutOfMemoryError e) {
} catch (final OutOfMemoryError e) {
throw new RowSpaceExceededException((int) alloc, "importRowSet");
}
//assert b.length - exportOverheadSize == size * rowdef.objectsize : "b.length = " + b.length + ", size * rowdef.objectsize = " + size * rowdef.objectsize;
@ -101,19 +103,19 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
System.arraycopy(b, (int) exportOverheadSize, chunkcache, 0, chunkcache.length);
return new RowSet(rowdef, size, chunkcache, orderbound);
}
public final static int importRowCount(final long blength, final Row rowdef) {
assert blength >= exportOverheadSize : "blength = " + blength;
if (blength < exportOverheadSize) return 0;
int c = (int) ((blength - exportOverheadSize) / (long) rowdef.objectsize);
final int c = (int) ((blength - exportOverheadSize) / rowdef.objectsize);
assert c >= 0;
return c;
}
private RowSet(Row rowdef, byte[] chunkcache, int chunkcount, int sortBound, long lastTimeWrote) {
private RowSet(final Row rowdef, final byte[] chunkcache, final int chunkcount, final int sortBound, final long lastTimeWrote) {
super(rowdef, chunkcache, chunkcount, sortBound, lastTimeWrote);
}
public RowSet clone() {
return new RowSet(super.rowdef, super.chunkcache, super.chunkcount, super.sortBound, super.lastTimeWrote);
}
@ -121,13 +123,13 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
public void reset() {
super.reset();
}
public final synchronized boolean has(final byte[] key) {
assert key.length == this.rowdef.primaryKeyLength;
final int index = find(key, 0);
return index >= 0;
}
public final synchronized Row.Entry get(final byte[] key) {
assert key.length == this.rowdef.primaryKeyLength;
final int index = find(key, 0);
@ -135,20 +137,20 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
return get(index, true);
}
public Map<byte[], Row.Entry> get(Collection<byte[]> keys) throws IOException, InterruptedException {
final Map<byte[], Row.Entry> map = new TreeMap<byte[], Row.Entry>(this.row().objectOrder);
public Map<byte[], Row.Entry> get(final Collection<byte[]> keys) throws IOException, InterruptedException {
final Map<byte[], Row.Entry> map = new TreeMap<byte[], Row.Entry>(row().objectOrder);
Row.Entry entry;
for (byte[] key: keys) {
for (final byte[] key: keys) {
entry = get(key);
if (entry != null) map.put(key, entry);
}
return map;
}
/**
* Adds the row to the index. The row is identified by the primary key of the row.
* @param row a index row
* @return true if this set did _not_ already contain the given row.
* @return true if this set did _not_ already contain the given row.
* @throws IOException
* @throws RowSpaceExceededException
*/
@ -251,39 +253,39 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
private final int find(final byte[] a, final int astart) {
// returns the chunknumber; -1 if not found
if (rowdef.objectOrder == null) return iterativeSearch(a, astart, 0, this.chunkcount);
if (this.rowdef.objectOrder == null) return iterativeSearch(a, astart, 0, this.chunkcount);
if ((this.chunkcount - this.sortBound) > collectionReSortLimit) {
sort();
}
if (this.rowdef.objectOrder != null && this.rowdef.objectOrder instanceof Base64Order) {
// first try to find in sorted area
assert this.rowdef.objectOrder.wellformed(a, astart, this.rowdef.primaryKeyLength) : "not wellformed: " + UTF8.String(a, astart, this.rowdef.primaryKeyLength);
}
// first try to find in sorted area
final int p = binarySearch(a, astart);
if (p >= 0) return p;
// then find in unsorted area
return iterativeSearch(a, astart, this.sortBound, this.chunkcount);
}
private final int iterativeSearch(final byte[] key, final int astart, final int leftBorder, final int rightBound) {
// returns the chunknumber
// returns the chunknumber
for (int i = leftBorder; i < rightBound; i++) {
assert key.length - astart >= this.rowdef.primaryKeyLength;
if (match(key, astart, i)) return i;
}
return -1;
}
private final int binarySearch(final byte[] key, final int astart) {
// returns the exact position of the key if the key exists,
// or -1 if the key does not exist
assert (rowdef.objectOrder != null);
assert (this.rowdef.objectOrder != null);
int l = 0;
int rbound = this.sortBound;
int p = 0;
@ -302,7 +304,7 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
// returns the exact position of the key if the key exists,
// or a position of an entry that is greater than the key if the
// key does not exist
assert (rowdef.objectOrder != null);
assert (this.rowdef.objectOrder != null);
int l = 0;
int rbound = this.sortBound;
int p = 0;
@ -316,118 +318,118 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
}
return l;
}
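find() above searches a partially sorted collection in two steps: a binary search over the sorted prefix up to sortBound, then a linear scan over the unsorted tail (the collection is re-sorted once the tail exceeds collectionReSortLimit). A toy version over int keys (hypothetical class):

import java.util.Arrays;

public class HybridSearch {
    // keys[0 .. sortBound) is sorted, keys[sortBound .. keys.length) is the unsorted tail
    private final int[] keys;
    private final int sortBound;

    public HybridSearch(final int[] sorted, final int[] unsortedTail) {
        this.keys = new int[sorted.length + unsortedTail.length];
        System.arraycopy(sorted, 0, this.keys, 0, sorted.length);
        System.arraycopy(unsortedTail, 0, this.keys, sorted.length, unsortedTail.length);
        this.sortBound = sorted.length;
    }

    // returns the position of key, or -1 if not found
    public int find(final int key) {
        // first try to find in the sorted area
        final int p = Arrays.binarySearch(this.keys, 0, this.sortBound, key);
        if (p >= 0) return p;
        // then find in the unsorted area
        for (int i = this.sortBound; i < this.keys.length; i++) {
            if (this.keys[i] == key) return i;
        }
        return -1;
    }

    public static void main(final String[] args) {
        final HybridSearch h = new HybridSearch(new int[]{2, 5, 9}, new int[]{7, 1});
        System.out.println(h.find(5)); // 1: binary search hit in the sorted area
        System.out.println(h.find(1)); // 4: found by the linear scan over the tail
        System.out.println(h.find(3)); // -1: in neither area
    }
}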
public final synchronized Iterator<byte[]> keys() {
sort();
return super.keys(true);
}
public final synchronized CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
return new keyIterator(up, firstKey);
}
public final class keyIterator implements CloneableIterator<byte[]> {
private final boolean up;
private final byte[] first;
private int p;
final int bound;
public keyIterator(final boolean up, byte[] firstKey) {
// see that all elements are sorted
sort();
this.up = up;
if (firstKey != null && firstKey.length == 0) firstKey = null;
this.first = firstKey;
this.bound = sortBound;
if (first == null) {
p = 0;
this.bound = RowSet.this.sortBound;
if (this.first == null) {
this.p = 0;
} else {
assert first.length == rowdef.primaryKeyLength : "first.length = " + first.length + ", rowdef.primaryKeyLength = " + rowdef.primaryKeyLength;
p = binaryPosition(first, 0); // check this to find bug in DHT selection enumeration
assert this.first.length == RowSet.this.rowdef.primaryKeyLength : "first.length = " + this.first.length + ", rowdef.primaryKeyLength = " + RowSet.this.rowdef.primaryKeyLength;
this.p = binaryPosition(this.first, 0); // check this to find bug in DHT selection enumeration
}
}
public final keyIterator clone(final Object second) {
return new keyIterator(up, (byte[]) second);
return new keyIterator(this.up, (byte[]) second);
}
public final boolean hasNext() {
if (p < 0) return false;
if (p >= size()) return false;
if (up) {
return p < bound;
if (this.p < 0) return false;
if (this.p >= size()) return false;
if (this.up) {
return this.p < this.bound;
} else {
return p >= 0;
return this.p >= 0;
}
}
public final byte[] next() {
final byte[] key = getKey(p);
if (up) p++; else p--;
final byte[] key = getKey(this.p);
if (this.up) this.p++; else this.p--;
return key;
}
public final void remove() {
throw new UnsupportedOperationException();
}
}
public final synchronized Iterator<Row.Entry> iterator() {
// iterates kelondroRow.Entry - type entries
sort();
return super.iterator();
}
public final synchronized CloneableIterator<Row.Entry> rows(final boolean up, final byte[] firstKey) {
return new rowIterator(up, firstKey);
}
public final synchronized CloneableIterator<Row.Entry> rows() {
return new rowIterator(true, null);
}
public final class rowIterator implements CloneableIterator<Row.Entry> {
private final boolean up;
private final byte[] first;
private int p;
final int bound;
public rowIterator(final boolean up, final byte[] firstKey) {
// see that all elements are sorted
sort();
this.up = up;
this.first = firstKey;
this.bound = sortBound;
if (first == null) {
p = 0;
this.bound = RowSet.this.sortBound;
if (this.first == null) {
this.p = 0;
} else {
assert first.length == rowdef.primaryKeyLength;
p = binaryPosition(first, 0); // check this to find bug in DHT selection enumeration
assert this.first.length == RowSet.this.rowdef.primaryKeyLength;
this.p = binaryPosition(this.first, 0); // check this to find bug in DHT selection enumeration
}
}
public final rowIterator clone(final Object second) {
return new rowIterator(up, (byte[]) second);
return new rowIterator(this.up, (byte[]) second);
}
public final boolean hasNext() {
if (p < 0) return false;
if (p >= size()) return false;
if (up) {
return p < bound;
if (this.p < 0) return false;
if (this.p >= size()) return false;
if (this.up) {
return this.p < this.bound;
} else {
return p >= 0;
return this.p >= 0;
}
}
public final Row.Entry next() {
final Row.Entry entry = get(p, true);
if (up) p++; else p--;
final Row.Entry entry = get(this.p, true);
if (this.up) this.p++; else this.p--;
return entry;
}
public final void remove() {
throw new UnsupportedOperationException();
}
@ -437,35 +439,35 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
* merge this row collection with another row collection.
* The resulting collection is sorted and contains no doubles; duplicates are removed during the merge.
* The new collection may be a copy of one of the old ones, or an alteration of one of the input collections.
* After this merge, none of the input collections should be used, because they can be altered
* After this merge, none of the input collections should be used, because they can be altered
* @param c
* @return
* @throws RowSpaceExceededException
* @throws RowSpaceExceededException
*/
public final RowSet merge(final RowSet c) throws RowSpaceExceededException {
assert c != null;
return mergeEnum(this, c);
}
/**
* merge this row collection with another row collection using a simultaneous iteration of the input collections
* the current collection is not altered in any way, the returned collection is a new collection with copied content.
* @param c
* @return
* @throws RowSpaceExceededException
* @throws RowSpaceExceededException
*/
protected final static RowSet mergeEnum(final RowCollection c0, final RowCollection c1) throws RowSpaceExceededException {
assert c0.rowdef == c1.rowdef : c0.rowdef.toString() + " != " + c1.rowdef.toString();
final RowSet r = new RowSet(c0.rowdef, c0.size() + c1.size());
try {
c0.sort();
} catch (Exception e) {
} catch (final Exception e) {
Log.logSevere("RowSet", "collection corrupted. cleaned. " + e.getMessage(), e);
c0.clear();
}
try {
c1.sort();
} catch (Exception e) {
} catch (final Exception e) {
Log.logSevere("RowSet", "collection corrupted. cleaned. " + e.getMessage(), e);
c1.clear();
}
@ -506,7 +508,7 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
}
return r;
}
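mergeEnum() builds the union of the two sorted collections in one simultaneous pass, dropping doubles as it goes. The same idea on plain int arrays (hypothetical sketch):

import java.util.ArrayList;
import java.util.List;

public class SortedMerge {
    // merge two sorted arrays into one sorted, duplicate-free list
    public static List<Integer> merge(final int[] a, final int[] b) {
        final List<Integer> r = new ArrayList<Integer>(a.length + b.length);
        int i = 0, j = 0;
        while (i < a.length && j < b.length) {
            if (a[i] < b[j]) {
                add(r, a[i++]);
            } else if (a[i] > b[j]) {
                add(r, b[j++]);
            } else { // equal keys: emit once, advance both cursors
                add(r, a[i++]);
                j++;
            }
        }
        while (i < a.length) add(r, a[i++]); // drain the remainders
        while (j < b.length) add(r, b[j++]);
        return r;
    }

    private static void add(final List<Integer> r, final int v) {
        // the inputs may contain doubles themselves; skip repeats of the last value
        if (r.isEmpty() || r.get(r.size() - 1).intValue() != v) r.add(v);
    }

    public static void main(final String[] args) {
        System.out.println(merge(new int[]{1, 3, 3, 7}, new int[]{2, 3, 8})); // [1, 2, 3, 7, 8]
    }
}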
public static void main(final String[] args) {
// sort/uniq-test
/*
@ -526,29 +528,29 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
rs.uniq(10000);
System.out.println("after uniq, size = " + rs.size());
*/
final String[] test = {
"eins......xxxx",
"zwei......xxxx",
"drei......xxxx",
"vier......xxxx",
"fuenf.....xxxx",
"sechs.....xxxx",
"sieben....xxxx",
"acht......xxxx",
"neun......xxxx",
"eins......xxxx",
"zwei......xxxx",
"drei......xxxx",
"vier......xxxx",
"fuenf.....xxxx",
"sechs.....xxxx",
"sieben....xxxx",
"acht......xxxx",
"neun......xxxx",
"zehn......xxxx" };
final RowSet d = new RowSet(new Row("byte[] key-10, Cardinal x-4 {b256}", NaturalOrder.naturalOrder));
for (int ii = 0; ii < test.length; ii++)
for (final String element : test)
try {
d.add(test[ii].getBytes());
} catch (RowSpaceExceededException e) {
d.add(element.getBytes());
} catch (final RowSpaceExceededException e) {
e.printStackTrace();
}
for (int ii = 0; ii < test.length; ii++)
for (final String element : test)
try {
d.add(test[ii].getBytes());
} catch (RowSpaceExceededException e) {
d.add(element.getBytes());
} catch (final RowSpaceExceededException e) {
e.printStackTrace();
}
d.sort();
@ -571,8 +573,8 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
System.out.println("UNIQ : " + d.toString());
d.trim();
System.out.println("TRIM : " + d.toString());
/*
// second test
c = new kelondroRowSet(new kelondroRow(new int[]{10, 3}));
@ -598,7 +600,7 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
System.out.println("after uniq: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
System.out.println("RESULT SIZE: " + c.size());
System.out.println();
// third test
c = new kelondroRowSet(new kelondroRow(new int[]{10, 3}), 60000);
c.setOrdering(kelondroNaturalOrder.naturalOrder, 0);
@ -613,7 +615,7 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
System.out.println("added " + k + " entries in " +
((t - start) / 1000) + " seconds, " +
(((t - start) > 1000) ? (k / ((t - start) / 1000)) : k) +
" entries/second, " + d + " double, size = " + c.size() +
" entries/second, " + d + " double, size = " + c.size() +
", sum = " + (c.size() + d));
}
System.out.println("RESULT SIZE: " + c.size());
@ -632,7 +634,7 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
System.out.println("RESULT SIZE: " + c.size());
System.out.println("Time: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
*/
// remove test
final long start = System.currentTimeMillis();
final RowSet c = new RowSet(new Row("byte[] a-12, byte[] b-12", Base64Order.enhancedCoder));
@ -650,15 +652,17 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
key = randomHash(random);
try {
c.put(c.rowdef.newEntry(new byte[][]{key, key}));
} catch (RowSpaceExceededException e) {
} catch (final RowSpaceExceededException e) {
e.printStackTrace();
}
if (i % 1000 == 0) {
for (int j = 0; j < delkeys.length; j++) c.delete(delkeys[j]);
for (final byte[] delkey : delkeys)
c.delete(delkey);
c.sort();
}
}
for (int j = 0; j < delkeys.length; j++) c.delete(delkeys[j]);
for (final byte[] delkey : delkeys)
c.delete(delkey);
c.sort();
random = new Random(0);
for (int i = 0; i < testsize; i++) {
@ -670,7 +674,7 @@ public class RowSet extends RowCollection implements Index, Iterable<Row.Entry>
System.out.println("RESULT SIZE: " + c.size());
System.out.println("Time: " + ((System.currentTimeMillis() - start) / 1000) + " seconds");
}
public static byte[] randomHash(final long r0, final long r1) {
// a long can have 64 bit, but a 12-byte hash can have 6 * 12 = 72 bits
// so we construct a generic Hash using two long values

File diff suppressed because it is too large

@ -9,7 +9,7 @@
// $LastChangedBy$
//
// LICENSE
//
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
@ -55,7 +55,6 @@ import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.logging.Log;
import net.yacy.kelondro.util.FileUtils;
import de.anomic.crawler.CrawlProfile;
import de.anomic.crawler.retrieval.FTPLoader;
import de.anomic.crawler.retrieval.FileLoader;
@ -71,7 +70,7 @@ public final class LoaderDispatcher {
private static final long minDelay = 250; // milliseconds; 4 accesses per second
private static final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<String, Long>(); // to protect targets from DDoS
private final Switchboard sb;
private final HashSet<String> supportedProtocols;
private final HTTPLoader httpLoader;
@ -80,25 +79,25 @@ public final class LoaderDispatcher {
private final FileLoader fileLoader;
private final HashMap<String, Semaphore> loaderSteering; // a map that delivers a 'finish' semaphore for urls
private final Log log;
public LoaderDispatcher(final Switchboard sb) {
this.sb = sb;
this.supportedProtocols = new HashSet<String>(Arrays.asList(new String[]{"http","https","ftp","smb","file"}));
// initiate loader objects
this.log = new Log("LOADER");
this.httpLoader = new HTTPLoader(sb, log);
this.ftpLoader = new FTPLoader(sb, log);
this.smbLoader = new SMBLoader(sb, log);
this.fileLoader = new FileLoader(sb, log);
this.httpLoader = new HTTPLoader(sb, this.log);
this.ftpLoader = new FTPLoader(sb, this.log);
this.smbLoader = new SMBLoader(sb, this.log);
this.fileLoader = new FileLoader(sb, this.log);
this.loaderSteering = new HashMap<String, Semaphore>();
}
public boolean isSupportedProtocol(final String protocol) {
if ((protocol == null) || (protocol.length() == 0)) return false;
return this.supportedProtocols.contains(protocol.trim().toLowerCase());
}
@SuppressWarnings("unchecked")
public HashSet<String> getSupportedProtocols() {
return (HashSet<String>) this.supportedProtocols.clone();
@ -117,54 +116,55 @@ public final class LoaderDispatcher {
final boolean global
) {
return new Request(
ASCII.getBytes(sb.peers.mySeed().hash),
url,
null,
"",
ASCII.getBytes(this.sb.peers.mySeed().hash),
url,
null,
"",
new Date(),
(forText) ?
((global) ?
sb.crawler.defaultTextSnippetGlobalProfile.handle() :
sb.crawler.defaultTextSnippetLocalProfile.handle())
this.sb.crawler.defaultTextSnippetGlobalProfile.handle() :
this.sb.crawler.defaultTextSnippetLocalProfile.handle())
:
((global) ?
sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
0,
0,
this.sb.crawler.defaultMediaSnippetGlobalProfile.handle() :
this.sb.crawler.defaultMediaSnippetLocalProfile.handle()), // crawl profile
0,
0,
0,
0);
}
public void load(final DigestURI url, CrawlProfile.CacheStrategy cacheStratgy, long maxFileSize, File targetFile) throws IOException {
public void load(final DigestURI url, final CrawlProfile.CacheStrategy cacheStratgy, final long maxFileSize, final File targetFile) throws IOException {
byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, false).getContent();
final byte[] b = load(request(url, false, true), cacheStratgy, maxFileSize, false).getContent();
if (b == null) throw new IOException("load == null");
File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
final File tmp = new File(targetFile.getAbsolutePath() + ".tmp");
// transaction-safe writing
File parent = targetFile.getParentFile();
final File parent = targetFile.getParentFile();
if (!parent.exists()) parent.mkdirs();
FileUtils.copy(b, tmp);
tmp.renameTo(targetFile);
}
public Response load(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize, boolean checkBlacklist) throws IOException {
String url = request.url().toNormalform(true, false);
public Response load(final Request request, final CrawlProfile.CacheStrategy cacheStrategy, final long maxFileSize, final boolean checkBlacklist) throws IOException {
final String url = request.url().toNormalform(true, false);
Semaphore check = this.loaderSteering.get(url);
if (check != null) {
// a loading process may be going on for that url
try { check.tryAcquire(5, TimeUnit.SECONDS);} catch (InterruptedException e) {}
try { check.tryAcquire(5, TimeUnit.SECONDS);} catch (final InterruptedException e) {}
// now the process may have terminated and we run a normal loading
// which may succeed faster because of a cache hit
}
this.loaderSteering.put(url, new Semaphore(0));
try {
Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist);check = this.loaderSteering.remove(url);
final Response response = loadInternal(request, cacheStrategy, maxFileSize, checkBlacklist);
check = this.loaderSteering.remove(url);
if (check != null) check.release(1000);
return response;
} catch (IOException e) {
} catch (final IOException e) {
// release the semaphore anyway
check = this.loaderSteering.remove(url);
if (check != null) check.release(1000);
@ -172,7 +172,7 @@ public final class LoaderDispatcher {
throw new IOException(e);
}
}
/**
* load a resource from the web, from ftp, from smb or a file
* @param request the request essentials
@ -180,69 +180,69 @@ public final class LoaderDispatcher {
* @return the loaded entity in a Response object
* @throws IOException
*/
private Response loadInternal(final Request request, CrawlProfile.CacheStrategy cacheStrategy, long maxFileSize, boolean checkBlacklist) throws IOException {
private Response loadInternal(final Request request, CrawlProfile.CacheStrategy cacheStrategy, final long maxFileSize, final boolean checkBlacklist) throws IOException {
// get the protocol of the next URL
final DigestURI url = request.url();
if (url.isFile() || url.isSMB()) cacheStrategy = CrawlProfile.CacheStrategy.NOCACHE; // load just from the file system
final String protocol = url.getProtocol();
final String host = url.getHost();
// check if we have the page in the cache
final CrawlProfile crawlProfile = sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
final CrawlProfile crawlProfile = this.sb.crawler.getActive(UTF8.getBytes(request.profileHandle()));
if (crawlProfile != null && cacheStrategy != CrawlProfile.CacheStrategy.NOCACHE) {
// we have passed a first test if caching is allowed
// now see if there is a cache entry
ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
final ResponseHeader cachedResponse = (url.isLocal()) ? null : Cache.getResponseHeader(url.hash());
final byte[] content = (cachedResponse == null) ? null : Cache.getContent(url.hash());
if (cachedResponse != null && content != null) {
// yes we have the content
// create request header values and a response object because we need that
// in case that we want to return the cached content in the next step
final RequestHeader requestHeader = new RequestHeader();
requestHeader.put(HeaderFramework.USER_AGENT, ClientIdentification.getUserAgent());
DigestURI refererURL = null;
if (request.referrerhash() != null) refererURL = sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (request.referrerhash() != null) refererURL = this.sb.getURL(Segments.Process.LOCALCRAWLING, request.referrerhash());
if (refererURL != null) requestHeader.put(RequestHeader.REFERER, refererURL.toNormalform(true, true));
Response response = new Response(
final Response response = new Response(
request,
requestHeader,
cachedResponse,
"200",
crawlProfile,
content);
// check which caching strategy shall be used
if (cacheStrategy == CrawlProfile.CacheStrategy.IFEXIST || cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) {
// well, just take the cache and don't care about freshness of the content
log.logInfo("cache hit/useall for: " + url.toNormalform(true, false));
this.log.logInfo("cache hit/useall for: " + url.toNormalform(true, false));
return response;
}
// now the cacheStrategy must be CACHE_STRATEGY_IFFRESH, that means we should do a proxy freshness test
assert cacheStrategy == CrawlProfile.CacheStrategy.IFFRESH : "cacheStrategy = " + cacheStrategy;
if (response.isFreshForProxy()) {
log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false));
this.log.logInfo("cache hit/fresh for: " + url.toNormalform(true, false));
return response;
} else {
log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
this.log.logInfo("cache hit/stale for: " + url.toNormalform(true, false));
}
} else if (cachedResponse != null) {
log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
this.log.logWarning("HTCACHE contained response header, but not content for url " + url.toNormalform(true, false));
} else if (content != null) {
log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
this.log.logWarning("HTCACHE contained content, but not response header for url " + url.toNormalform(true, false));
}
}
// check case where we want results from the cache exclusively, and never from the internet (offline mode)
if (cacheStrategy == CrawlProfile.CacheStrategy.CACHEONLY) {
// we had a chance to get the content from the cache .. it's over. We don't have it.
throw new IOException("cache only strategy");
}
// now forget about the cache, nothing there. Try to load the content from the internet
// check access time: this is a double-check (we possibly already checked in the balancer)
// to make sure that we don't DoS the target by mistake
if (!url.isLocal()) {
@ -260,13 +260,13 @@ public final class LoaderDispatcher {
// now it's for sure that we will access the target. Remember the access time
if (host != null) accessTime.put(host, System.currentTimeMillis());
// load resource from the internet
Response response = null;
if ((protocol.equals("http") || (protocol.equals("https")))) response = httpLoader.load(request, maxFileSize, checkBlacklist);
if (protocol.equals("ftp")) response = ftpLoader.load(request, true);
if (protocol.equals("smb")) response = smbLoader.load(request, true);
if (protocol.equals("file")) response = fileLoader.load(request, true);
if ((protocol.equals("http") || (protocol.equals("https")))) response = this.httpLoader.load(request, maxFileSize, checkBlacklist);
if (protocol.equals("ftp")) response = this.ftpLoader.load(request, true);
if (protocol.equals("smb")) response = this.smbLoader.load(request, true);
if (protocol.equals("file")) response = this.fileLoader.load(request, true);
if (response != null && response.getContent() != null) {
// we got something. Now check if we want to store that to the cache
// the first check looks at whether we want to store the content to the cache
@ -275,19 +275,19 @@ public final class LoaderDispatcher {
return response;
}
// the second check tells us whether the protocol says something about caching
String storeError = response.shallStoreCacheForCrawler();
final String storeError = response.shallStoreCacheForCrawler();
if (storeError == null) {
try {
Cache.store(url, response.getResponseHeader(), response.getContent());
} catch (IOException e) {
log.logWarning("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e);
} catch (final IOException e) {
this.log.logWarning("cannot write " + response.url() + " to Cache (3): " + e.getMessage(), e);
}
} else {
log.logWarning("cannot write " + response.url() + " to Cache (4): " + storeError);
this.log.logWarning("cannot write " + response.url() + " to Cache (4): " + storeError);
}
return response;
}
throw new IOException("Unsupported protocol '" + protocol + "' in url " + url);
}
@ -297,19 +297,19 @@ public final class LoaderDispatcher {
* @param cacheStrategy
* @param timeout
* @return the content as {@link byte[]}
* @throws IOException
* @throws IOException
*/
public byte[] loadContent(final Request request, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
public byte[] loadContent(final Request request, final CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
// try to download the resource using the loader
final long maxFileSize = sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
final Response entry = load(request, cacheStrategy, maxFileSize, false);
if (entry == null) return null; // not found in web
// read resource body (if it is there)
return entry.getContent();
}
public Document[] loadDocuments(final Request request, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, long maxFileSize) throws IOException, Parser.Failure {
public Document[] loadDocuments(final Request request, final CrawlProfile.CacheStrategy cacheStrategy, final int timeout, final long maxFileSize) throws IOException, Parser.Failure {
// load resource
final Response response = load(request, cacheStrategy, maxFileSize, false);
@ -323,16 +323,16 @@ public final class LoaderDispatcher {
return response.parse();
}
public ContentScraper parseResource(final DigestURI location, CrawlProfile.CacheStrategy cachePolicy) throws IOException {
public ContentScraper parseResource(final DigestURI location, final CrawlProfile.CacheStrategy cachePolicy) throws IOException {
// load page
final long maxFileSize = this.sb.getConfigLong("crawler.http.maxFileSize", HTTPLoader.DEFAULT_MAXFILESIZE);
Response r = this.load(request(location, true, false), cachePolicy, maxFileSize, false);
byte[] page = (r == null) ? null : r.getContent();
final Response r = this.load(request(location, true, false), cachePolicy, maxFileSize, false);
final byte[] page = (r == null) ? null : r.getContent();
if (page == null) throw new IOException("no response from url " + location.toString());
try {
return htmlParser.parseToScraper(location, r.getCharacterEncoding(), new ByteArrayInputStream(page));
} catch(Parser.Failure e) {
} catch(final Parser.Failure e) {
throw new IOException(e.getMessage());
}
}
@ -344,16 +344,16 @@ public final class LoaderDispatcher {
* @return a map from URLs to the anchor texts of the urls
* @throws IOException
*/
public final Map<MultiProtocolURI, String> loadLinks(DigestURI url, CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE, false);
public final Map<MultiProtocolURI, String> loadLinks(final DigestURI url, final CrawlProfile.CacheStrategy cacheStrategy) throws IOException {
final Response response = load(request(url, true, false), cacheStrategy, Long.MAX_VALUE, false);
if (response == null) throw new IOException("response == null");
ResponseHeader responseHeader = response.getResponseHeader();
final ResponseHeader responseHeader = response.getResponseHeader();
byte[] resource = response.getContent();
if (resource == null) throw new IOException("resource == null");
if (responseHeader == null) throw new IOException("responseHeader == null");
Document[] documents = null;
String supportError = TextParser.supports(url, responseHeader.mime());
final String supportError = TextParser.supports(url, responseHeader.mime());
if (supportError != null) throw new IOException("no parser support: " + supportError);
try {
documents = TextParser.parseSource(url, responseHeader.mime(), responseHeader.getCharacterEncoding(), resource.length, new ByteArrayInputStream(resource));
@ -366,8 +366,8 @@ public final class LoaderDispatcher {
return Document.getHyperlinks(documents);
}
public synchronized void cleanupAccessTimeTable(long timeout) {
public synchronized void cleanupAccessTimeTable(final long timeout) {
final Iterator<Map.Entry<String, Long>> i = accessTime.entrySet().iterator();
Map.Entry<String, Long> e;
while (i.hasNext()) {
@ -376,37 +376,37 @@ public final class LoaderDispatcher {
if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove();
}
}
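accessTime and minDelay together form a simple per-host rate limit, and cleanupAccessTimeTable() above keeps the table bounded. A reduced sketch of the pattern (hypothetical class; the sleep-based wait is an assumption, since only the bookkeeping and the cleanup are visible in this hunk, and the real code also skips local URLs):

import java.util.Iterator;
import java.util.Map;
import java.util.concurrent.ConcurrentHashMap;

public class HostThrottle {
    private static final long minDelay = 250; // milliseconds between accesses to one host
    private final ConcurrentHashMap<String, Long> accessTime = new ConcurrentHashMap<String, Long>();

    // block until at least minDelay ms have passed since the last access to host
    public void acquire(final String host) throws InterruptedException {
        final Long last = this.accessTime.get(host);
        if (last != null) {
            final long wait = minDelay - (System.currentTimeMillis() - last.longValue());
            if (wait > 0) Thread.sleep(wait); // assumed wait strategy
        }
        this.accessTime.put(host, Long.valueOf(System.currentTimeMillis()));
    }

    // forget hosts whose last access is older than minDelay, to bound the table
    public void cleanup() {
        final Iterator<Map.Entry<String, Long>> i = this.accessTime.entrySet().iterator();
        while (i.hasNext()) {
            final Map.Entry<String, Long> e = i.next();
            if (System.currentTimeMillis() - e.getValue().longValue() > minDelay) i.remove();
        }
    }
}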
public void loadIfNotExistBackground(String url, File cache, long maxFileSize) {
public void loadIfNotExistBackground(final String url, final File cache, final long maxFileSize) {
new Loader(url, cache, maxFileSize, CrawlProfile.CacheStrategy.IFEXIST).start();
}
public void loadIfNotExistBackground(String url, long maxFileSize) {
public void loadIfNotExistBackground(final String url, final long maxFileSize) {
new Loader(url, null, maxFileSize, CrawlProfile.CacheStrategy.IFEXIST).start();
}
private class Loader extends Thread {
private String url;
private File cache;
private long maxFileSize;
private CrawlProfile.CacheStrategy cacheStrategy;
public Loader(String url, File cache, long maxFileSize, CrawlProfile.CacheStrategy cacheStrategy) {
private final String url;
private final File cache;
private final long maxFileSize;
private final CrawlProfile.CacheStrategy cacheStrategy;
public Loader(final String url, final File cache, final long maxFileSize, final CrawlProfile.CacheStrategy cacheStrategy) {
this.url = url;
this.cache = cache;
this.maxFileSize = maxFileSize;
this.cacheStrategy = cacheStrategy;
}
public void run() {
if (this.cache != null && this.cache.exists()) return;
try {
// load from the net
Response response = load(request(new DigestURI(this.url), false, true), this.cacheStrategy, this.maxFileSize, true);
byte[] b = response.getContent();
final Response response = load(request(new DigestURI(this.url), false, true), this.cacheStrategy, this.maxFileSize, true);
final byte[] b = response.getContent();
if (this.cache != null) FileUtils.copy(b, this.cache);
} catch (MalformedURLException e) {} catch (IOException e) {}
} catch (final MalformedURLException e) {} catch (final IOException e) {}
}
}
}
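load() steers concurrent requests for the same URL through a per-URL 'finish' Semaphore: a late arrival waits up to five seconds for the running load and then proceeds anyway, usually answered from the freshly filled cache, while the finishing load removes the semaphore and releases generously so all waiters wake. A condensed sketch (hypothetical names; map access is synchronized here to keep the sketch self-contained):

import java.util.HashMap;
import java.util.concurrent.Semaphore;
import java.util.concurrent.TimeUnit;

public class LoadSteering {
    // url -> 'finish' semaphore of a load that is currently in flight
    private final HashMap<String, Semaphore> steering = new HashMap<String, Semaphore>();

    public byte[] load(final String url) {
        Semaphore check;
        synchronized (this.steering) { check = this.steering.get(url); }
        if (check != null) {
            // a load for this url is already running: wait briefly, then load
            // anyway; the retry is usually answered from the freshly filled cache
            try { check.tryAcquire(5, TimeUnit.SECONDS); } catch (final InterruptedException e) {}
        }
        synchronized (this.steering) { this.steering.put(url, new Semaphore(0)); }
        try {
            return doLoad(url); // stand-in for the actual cache/network access
        } finally {
            Semaphore finished;
            synchronized (this.steering) { finished = this.steering.remove(url); }
            // release generously so every waiter wakes, however many there are
            if (finished != null) finished.release(1000);
        }
    }

    private byte[] doLoad(final String url) {
        return new byte[0]; // hypothetical placeholder
    }
}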