diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java
index b8191d340..15c24e20a 100644
--- a/htroot/IndexControl_p.java
+++ b/htroot/IndexControl_p.java
@@ -300,7 +300,7 @@ public class IndexControl_p {
"true".equalsIgnoreCase(gzipBody),
timeout);
result = (String) resultObj.get("result");
- prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
+ prop.put("result", (result == null) ? ("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries + " URL not found") : result);
index = null;
}
diff --git a/htroot/yacy/transferRWI.java b/htroot/yacy/transferRWI.java
index f9152a133..9de800a11 100644
--- a/htroot/yacy/transferRWI.java
+++ b/htroot/yacy/transferRWI.java
@@ -203,7 +203,7 @@ public final class transferRWI {
}
if (unknownURLs.length() > 0) { unknownURLs.delete(0, 1); }
if ((wordhashes.length == 0) || (received == 0)) {
- sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs");
+ sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs, blocked " + blocked + " RWIs");
} else {
final double avdist = (yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[0]) + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[received - 1])) / 2.0;
sb.getLog().logInfo("Received " + received + " Entries " + wordc + " Words [" + wordhashes[0] + " .. " + wordhashes[received - 1] + "]/" + avdist + " from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + "/" + receivedURL + " URLs, blocked " + blocked + " RWIs");
diff --git a/htroot/yacy/transferURL.java b/htroot/yacy/transferURL.java
index bfcdbce05..36d3209f4 100644
--- a/htroot/yacy/transferURL.java
+++ b/htroot/yacy/transferURL.java
@@ -46,10 +46,12 @@
// javac -classpath .:../classes transferRWI.java
import java.io.IOException;
+import java.text.ParseException;
import de.anomic.http.httpHeader;
import de.anomic.index.indexURLEntry;
import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
@@ -59,11 +61,14 @@ import de.anomic.yacy.yacySeed;
public final class transferURL {
+
public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) throws InterruptedException {
if (post == null || ss == null) { return null; }
long start = System.currentTimeMillis();
-
+ long freshdate = 0; // minimum acceptable freshdate; older entries are rejected below
+ try {freshdate = plasmaURL.shortDayFormatter.parse("20061101").getTime();} catch (ParseException e1) {/* constant pattern, cannot fail */}
+
// return variable that accumulates replacements
final plasmaSwitchboard sb = (plasmaSwitchboard) ss;
final serverObjects prop = new serverObjects();
@@ -93,35 +98,45 @@ public final class transferURL {
indexURLEntry lEntry;
for (int i = 0; i < urlc; i++) {
serverCore.checkInterruption();
+
+ // read new lurl-entry
urls = (String) post.get("url" + i);
if (urls == null) {
yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
- } else {
- lEntry = sb.wordIndex.loadedURL.newEntry(urls);
- if (lEntry == null) {
- yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
- // TODO: should we send back an error message???
- } else {
- indexURLEntry.Components comp = lEntry.comp();
- if (comp.url() == null) {
- yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
- // TODO: should we send back an error message???
- } else {
- if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
- int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
- yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
- lEntry = null;
- blocked++;
- } else try {
- sb.wordIndex.loadedURL.store(lEntry);
- sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
- yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
- received++;
- } catch (IOException e) {
- e.printStackTrace();
- }
- }
- }
+ continue;
+ }
+
+ // parse new lurl-entry
+ lEntry = sb.wordIndex.loadedURL.newEntry(urls);
+ if (lEntry == null) {
+ yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
+ continue;
+ }
+
+            // check if entry is well-formed and fresh enough
+            indexURLEntry.Components comp = lEntry.comp();
+            if ((comp.url() == null) || (lEntry.freshdate().getTime() <= freshdate)) {
+                yacyCore.log.logWarning("transferURL: received invalid or stale URL entry from peer " + otherPeerName + "\n\tURL Property: " + urls);
+ continue;
+ }
+
+ // check if the entry is blacklisted
+ if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
+ int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
+ yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
+ lEntry = null;
+ blocked++;
+ continue;
+ }
+
+ // write entry to database
+ try {
+ sb.wordIndex.loadedURL.store(lEntry);
+ sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
+ yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
+ received++;
+ } catch (IOException e) {
+ e.printStackTrace();
}
}
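[Illustration, not part of the patch: the new guard rejects received URL entries whose freshdate predates 2006-11-01, the point from which freshdate values are computed properly. A minimal sketch of the check in isolation, assuming plasmaURL.shortDayFormatter is a SimpleDateFormat with pattern "yyyyMMdd", as the literal suggests.]

    long cutoff = 0;
    try {
        cutoff = plasmaURL.shortDayFormatter.parse("20061101").getTime();
    } catch (ParseException e) {
        // constant pattern, cannot fail; a cutoff of 0 would disable the guard
    }
    // accept only well-formed entries whose freshdate lies after the cutoff
    boolean acceptable = (comp.url() != null) && (lEntry.freshdate().getTime() > cutoff);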
diff --git a/source/de/anomic/index/indexRWIEntryNew.java b/source/de/anomic/index/indexRWIEntryNew.java
index d9eae2d2f..5894b242c 100644
--- a/source/de/anomic/index/indexRWIEntryNew.java
+++ b/source/de/anomic/index/indexRWIEntryNew.java
@@ -117,7 +117,6 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
int sizeOfPage, // # of bytes of the page TODO: not needed any more
long lastmodified, // last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
- int quality, // the entropy value
String language, // (guessed) language of document
char doctype, // type of document
int outlinksSame, // outlinks to same domain
diff --git a/source/de/anomic/index/indexRWIEntryOld.java b/source/de/anomic/index/indexRWIEntryOld.java
index 25f3a93a8..268f9dec2 100644
--- a/source/de/anomic/index/indexRWIEntryOld.java
+++ b/source/de/anomic/index/indexRWIEntryOld.java
@@ -33,7 +33,6 @@ import de.anomic.kelondro.kelondroColumn;
import de.anomic.kelondro.kelondroRow;
import de.anomic.kelondro.kelondroRow.Entry;
import de.anomic.plasma.plasmaSearchQuery;
-import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndex;
import de.anomic.yacy.yacySeedDB;
@@ -66,7 +65,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
private static final int col_hitcount = 3;
private static final int col_language = 4;
private static final int col_doctype = 5;
- private static final int col_localflag = 6;
+ //private static final int col_localflag = 6;
private static final int col_posintext = 7;
private static final int col_posinphrase = 8;
private static final int col_posofphrase = 9;
@@ -77,6 +76,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
private kelondroRow.Entry entry;
+ /*
public indexRWIEntryOld(String urlHash,
int urlLength, // byte-length of complete URL
int urlComps, // number of path components
@@ -91,7 +91,6 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
int sizeOfPage, // # of bytes of the page
long lastmodified, //*last-modified time of the document where word appears
long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
- int quality, //*the entropy value
String language, //*(guessed) language of document
char doctype, //*type of document
int outlinksSame, // outlinks to same domain
@@ -107,7 +106,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
this.entry = urlEntryRow.newEntry();
this.entry.setCol(col_urlhash, urlHash, null);
- this.entry.setCol(col_quality, quality);
+ this.entry.setCol(col_quality, 0);
this.entry.setCol(col_lastModified, lastmodified);
this.entry.setCol(col_hitcount, hitcount);
this.entry.setCol(col_language, language, null);
@@ -121,7 +120,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
this.entry.setCol(col_phrasecount, phrasecount);
//System.out.println("DEBUG-NEWENTRY " + toPropertyForm());
}
-
+*/
public indexRWIEntryOld(String urlHash, String code) {
// the code is the external form of the row minus the leading urlHash entry
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
diff --git a/source/de/anomic/kelondro/kelondroBitfield.java b/source/de/anomic/kelondro/kelondroBitfield.java
index c7560ea8b..ce1042987 100644
--- a/source/de/anomic/kelondro/kelondroBitfield.java
+++ b/source/de/anomic/kelondro/kelondroBitfield.java
@@ -24,7 +24,7 @@
package de.anomic.kelondro;
-public class kelondroBitfield {
+public class kelondroBitfield implements Cloneable {
// the bitfield implements a binary array. Such arrays may be exported in a base64-String
@@ -55,6 +55,12 @@ public class kelondroBitfield {
}
}
+ public Object clone() {
+ kelondroBitfield theClone = new kelondroBitfield(new byte[this.bb.length]);
+ System.arraycopy(this.bb, 0, theClone.bb, 0, this.bb.length);
+ return theClone;
+ }
+
public void set(int pos, boolean value) {
assert (pos >= 0);
int slot = pos / 8;
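[Illustration, not part of the patch: clone() deep-copies the backing byte array, so the clone can be mutated independently. A usage sketch, assuming the int constructor takes the length in bytes and a get(int) accessor mirrors set(int, boolean).]

    kelondroBitfield original = new kelondroBitfield(4); // 4 bytes = 32 bit positions
    original.set(3, true);
    kelondroBitfield copy = (kelondroBitfield) original.clone();
    copy.set(3, false);
    // original.get(3) is still true: System.arraycopy duplicated the array;
    // a shallow copy would have shared the bb array between both instances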
diff --git a/source/de/anomic/kelondro/kelondroRecords.java b/source/de/anomic/kelondro/kelondroRecords.java
index e573f7532..8644b5c4d 100644
--- a/source/de/anomic/kelondro/kelondroRecords.java
+++ b/source/de/anomic/kelondro/kelondroRecords.java
@@ -1392,7 +1392,7 @@ public class kelondroRecords {
USAGE.FREEC--;
// take link
if (USAGE.FREEH.index == NUL) {
- System.out.println("INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records. Affected file: " + filename);
+ serverLog.logSevere("kelondroRecords/" + filename, "INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records.");
// try to heal..
USAGE.USEDC = USAGE.allCount() + 1;
USAGE.FREEC = 0;
@@ -1402,10 +1402,17 @@ public class kelondroRecords {
//System.out.println("*DEBUG* ALLOCATED DELETED INDEX " + index);
// check for valid seek position
long seekp = seekpos(USAGE.FREEH);
- if (seekp > entryFile.length()) throw new kelondroException("new Handle: seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
-
- // read link to next element of FREEH chain
- USAGE.FREEH.index = entryFile.readInt(seekp);
+ if (seekp > entryFile.length()) {
+ // this is a severe inconsistency. try to heal..
+ serverLog.logSevere("kelondroRecords/" + filename, "new Handle: lost " + USAGE.FREEC + " marked nodes; seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
+ index = USAGE.allCount(); // a place at the end of the file
+ USAGE.USEDC += USAGE.FREEC; // to avoid that non-empty records at the end are overwritten
+ USAGE.FREEC = 0; // discard all possible empty nodes
+ USAGE.FREEH.index = NUL;
+ } else {
+ // read link to next element of FREEH chain
+ USAGE.FREEH.index = entryFile.readInt(seekp);
+ }
}
USAGE.write();
}
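[Illustration, not part of the patch: a worked example of the corruption test, assuming seekpos(h) = POS_NODES + recordsize * h.index, consistent with the record count (entryFile.length() - POS_NODES) / recordsize printed in the log message.]

    // say POS_NODES = 1024, recordsize = 256 and the file is 9216 bytes long:
    // it then holds (9216 - 1024) / 256 = 32 records, valid indexes 0..31
    long seekp = POS_NODES + (long) recordsize * USAGE.FREEH.index;
    // e.g. FREEH.index = 40 gives seekp = 1024 + 256 * 40 = 11264 > 9216,
    // i.e. the free-list head points past EOF: the chain is corrupt, so it is
    // discarded and the new node is allocated at the end of the file instead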
diff --git a/source/de/anomic/plasma/plasmaCondenser.java b/source/de/anomic/plasma/plasmaCondenser.java
index 76d57dfe8..6fe6298f7 100644
--- a/source/de/anomic/plasma/plasmaCondenser.java
+++ b/source/de/anomic/plasma/plasmaCondenser.java
@@ -49,7 +49,6 @@ import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
-import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
@@ -66,6 +65,8 @@ import java.util.TreeMap;
import java.util.TreeSet;
import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.htmlFilter.htmlFilterImageEntry;
+import de.anomic.index.indexRWIEntryNew;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
@@ -114,20 +115,123 @@ public final class plasmaCondenser {
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
public int RESULT_SIMI_WORDS = -1;
- public int RESULT_WORD_ENTROPHY = -1;
public int RESULT_NUMB_SENTENCES = -1;
public int RESULT_DIFF_SENTENCES = -1;
public int RESULT_SIMI_SENTENCES = -1;
public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);
- public plasmaCondenser(plasmaParserDocument document) throws UnsupportedEncodingException {
+ public plasmaCondenser(plasmaParserDocument document, boolean addMedia) throws UnsupportedEncodingException {
+ // if addMedia == true, then all the media links are also parsed and added to the words
+        // added media words are flagged with the appropriate media flag
this(document.getText(), document.getCharset());
+
+ kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
+ // construct flag set for document
if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
+
+ // the phrase counter:
+ // phrase 0 are words taken from the URL
+ // phrase 1 is the MainLongTitle
+ // phrase 2 is the MainShortTitle
+ // phrase 3 is the Document Abstract
+ // phrase 4 is the Document Author
+ // phrase 5 are the tags specified in document
+ // phrase 10 and above are the section headlines/titles (88 possible)
+ // phrase 98 is taken from the embedded anchor/hyperlinks description
+ // phrase 99 is taken from the media Link url and anchor description
+ // phrase 100 and above are lines from the text
+
+ insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags);
+ insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
+ insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags);
+ // missing: author!
+ // missing: tags!
+ String[] titles = document.getSectionTitles();
+ for (int i = 0; 1 < titles.length; i++) {
+ insertTextToWords(titles[i], i + 10, indexRWIEntryNew.flag_app_emphasized, wflags);
+ }
+
+ // anchors
+ Iterator i = document.getAnchors().entrySet().iterator();
+ Map.Entry entry;
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ insertTextToWords((String) entry.getKey(), 98, indexRWIEntryNew.flag_app_url, wflags);
+ insertTextToWords((String) entry.getValue(), 98, indexRWIEntryNew.flag_app_url, wflags);
+ }
+
+ // audio
+ i = document.getAudiolinks().entrySet().iterator();
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, wflags);
+ insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, wflags);
+ }
+
+ // video
+ i = document.getVideolinks().entrySet().iterator();
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, wflags);
+ insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, wflags);
+ }
+
+ // applications
+ i = document.getApplinks().entrySet().iterator();
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, wflags);
+ insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, wflags);
+ }
+
+ // images
+ i = document.getImages().iterator();
+ htmlFilterImageEntry ientry;
+ while (i.hasNext()) {
+ ientry = (htmlFilterImageEntry) i.next();
+ insertTextToWords((String) ientry.url().toNormalform(), 99, flag_cat_hasimage, wflags);
+ insertTextToWords((String) ientry.alt(), 99, flag_cat_hasimage, wflags);
+ }
+
+ // finally check all words for missing flag entry
+ i = words.entrySet().iterator();
+ wordStatProp wprop;
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ wprop = (wordStatProp) entry.getValue();
+ if (wprop.flags == null) {
+ wprop.flags = (kelondroBitfield) wflags.clone();
+ words.put(entry.getKey(), wprop);
+ }
+ }
}
+ private void insertTextToWords(String text, int phrase, int flagpos, kelondroBitfield flagstemplate) {
+ String word;
+ wordStatProp wprop;
+ sievedWordsEnum wordenum;
+ try {
+ wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8", 3);
+ } catch (UnsupportedEncodingException e) {
+ return;
+ }
+ int pip = 0;
+ while (wordenum.hasMoreElements()) {
+ word = ((String) wordenum.nextElement()).toLowerCase();
+ wprop = (wordStatProp) words.get(word);
+ if (wprop == null) wprop = new wordStatProp(0, pip, phrase);
+ if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone();
+            wprop.numOfPhrase = phrase;
+ wprop.posInPhrase = pip;
+ wprop.flags.set(flagpos, true);
+ words.put(word, wprop);
+ pip++;
+ }
+ }
+
public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
this(text, charset, 3, 2);
}
@@ -174,18 +278,19 @@ public final class plasmaCondenser {
}
public Map words() {
- // returns the words as wod/wordStatProp relation map
+ // returns the words as word/wordStatProp relation map
return words;
}
public static class wordStatProp {
// object carries statistics for words and sentences
- public int count; // number of occurrences
- public int posInText; // unique handle, is initialized with word position (excluding double occurring words)
- public int posInPhrase; //
- public int numOfPhrase;
- public HashSet hash; //
+ public int count; // number of occurrences
+ public int posInText; // unique handle, is initialized with word position (excluding double occurring words)
+ public int posInPhrase; // position of word in phrase
+ public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
+ public HashSet hash; // a set of handles to all sentences where this word appears
+ public kelondroBitfield flags; // the flag bits for each word
public wordStatProp(int handle, int pip, int nop) {
this.count = 1;
@@ -193,6 +298,7 @@ public final class plasmaCondenser {
this.posInPhrase = pip;
this.numOfPhrase = nop;
this.hash = new HashSet();
+ this.flags = null;
}
public void inc() {
@@ -314,7 +420,7 @@ public final class plasmaCondenser {
} else {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount++;
- wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 1);
+ wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 100);
}
words.put(word, wsp);
// we now have the unique handle of the word, put it into the sentence:
@@ -429,7 +535,6 @@ public final class plasmaCondenser {
this.RESULT_NUMB_WORDS = allwordcounter;
this.RESULT_DIFF_WORDS = wordHandleCount;
this.RESULT_SIMI_WORDS = words.size();
- this.RESULT_WORD_ENTROPHY = (allwordcounter == 0) ? 0 : (255 * words.size() / allwordcounter);
this.RESULT_NUMB_SENTENCES = allsentencecounter;
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
this.RESULT_SIMI_SENTENCES = sentences.size();
@@ -508,6 +613,7 @@ public final class plasmaCondenser {
return orderedSentences;
}
+ /*
public void writeMapToFile(File out) throws IOException {
Map.Entry entry;
String k;
@@ -520,7 +626,7 @@ public final class plasmaCondenser {
// we reconstruct the word hashtable
// and sort the entries by the number of occurrences
// this structure is needed to print out a sorted list of words
- TreeMap sortedWords = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/);
+ TreeMap sortedWords = new TreeMap(); //kelondroNaturalOrder.naturalOrder
it = words.entrySet().iterator(); // enumerates the keys in ascending order
while (it.hasNext()) {
entry = (Map.Entry) it.next();
@@ -549,7 +655,7 @@ public final class plasmaCondenser {
}
writer.close();
}
-
+*/
public final static boolean invisible(char c) {
// TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
if ((c < ' ') || (c > 'z')) return true;
@@ -771,16 +877,22 @@ public final class plasmaCondenser {
}
- public static Map getWords(InputStream input, String charset) throws UnsupportedEncodingException {
- if (input == null) return null;
- plasmaCondenser condenser = new plasmaCondenser(input, charset);
- return condenser.words;
- }
-
public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException {
+ // returns a word/wordStatProp relation map
if (text == null) return null;
ByteArrayInputStream buffer = new ByteArrayInputStream(text);
- return getWords(buffer, charset);
+ return new plasmaCondenser(buffer, charset, 2, 1).words();
+ }
+
+ public static Map getWords(String text) {
+ // returns a word/wordStatProp relation map
+ if (text == null) return null;
+ ByteArrayInputStream buffer = new ByteArrayInputStream(text.getBytes());
+ try {
+ return new plasmaCondenser(buffer, "UTF-8", 2, 1).words();
+ } catch (UnsupportedEncodingException e) {
+ return null;
+ }
}
public static void main(String[] args) {
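[Illustration, not part of the patch: a minimal usage sketch of the new getWords(String) convenience method; it assumes the returned keys are the lower-cased words and the values are the wordStatProp objects defined above, with words from plain text numbered from phrase 100 upwards.]

    Map words = plasmaCondenser.getWords("YaCy is a peer-to-peer search engine");
    plasmaCondenser.wordStatProp prop = (plasmaCondenser.wordStatProp) words.get("search");
    if (prop != null) {
        System.out.println("count=" + prop.count + ", phrase=" + prop.numOfPhrase);
    }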
diff --git a/source/de/anomic/plasma/plasmaParserDocument.java b/source/de/anomic/plasma/plasmaParserDocument.java
index 3194f3360..532b7417c 100644
--- a/source/de/anomic/plasma/plasmaParserDocument.java
+++ b/source/de/anomic/plasma/plasmaParserDocument.java
@@ -220,6 +220,7 @@ public class plasmaParserDocument {
public Map getAnchors() {
        // returns all links embedded as anchors (clickable entities)
+ // this is a url(String)/text(String) map
return anchors;
}
diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java
index c0c9874c7..7028655bf 100644
--- a/source/de/anomic/plasma/plasmaSnippetCache.java
+++ b/source/de/anomic/plasma/plasmaSnippetCache.java
@@ -52,6 +52,7 @@ import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
+import java.util.Map;
import java.util.Set;
import de.anomic.http.httpHeader;
@@ -255,27 +256,38 @@ public class plasmaSnippetCache {
try { resContent.close(); } catch (Exception e) {/* ignore this */}
}
if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
-
- //System.out.println("loaded document for URL " + url);
- final Enumeration sentences = document.getSentences(pre);
- document.close();
- //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
- if (sentences == null) {
- //System.out.println("found no sentences in url " + url);
- return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
- }
-
+
+
/* ===========================================================================
* COMPUTE SNIPPET
* =========================================================================== */
// we have found a parseable non-empty file: use the lines
- line = computeSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
- //System.out.println("loaded snippet for URL " + url + ": " + line);
+
+ // compute snippet from text
+ final Enumeration sentences = document.getSentences(pre);
+ if (sentences == null) return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
+ String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
+
+ // compute snippet from media
+ String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+ String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
+ String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
+ //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
+ //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+
+ line = "";
+            if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
+            if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
+            if (appline != null) line += (line.length() == 0) ? appline : "<br />" + appline;
+            //if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
+            if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
+
-        if (line == null) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
+        if ((line == null) || (line.length() == 0)) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);
// finally store this snippet in our own cache
storeToCache(wordhashes, urlhash, line);
+ document.close();
return new Snippet(line, source, null);
}
@@ -366,7 +378,32 @@ public class plasmaSnippetCache {
return (String) snippetsCache.get(key);
}
- private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
+ private String computeMediaSnippet(Map media, Set queryhashes) {
+ Iterator i = media.entrySet().iterator();
+ Map.Entry entry;
+ String url, desc;
+ Set s;
+ String result = "";
+ while (i.hasNext()) {
+ entry = (Map.Entry) i.next();
+ url = (String) entry.getKey();
+ desc = (String) entry.getValue();
+ s = removeAppearanceHashes(url, queryhashes);
+ if (s.size() == 0) {
+ result += "
" + ((desc.length() == 0) ? url : desc) + "";
+ continue;
+ }
+ s = removeAppearanceHashes(desc, s);
+ if (s.size() == 0) {
+ result += "
" + ((desc.length() == 0) ? url : desc) + "";
+ continue;
+ }
+ }
+ if (result.length() == 0) return null;
+ return result.substring(6);
+ }
+
+ private String computeTextSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
try {
if (sentences == null) return null;
if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
@@ -404,20 +441,43 @@ public class plasmaSnippetCache {
shortLineLength = ((String) sb.get(i)).length();
}
}
+
// find a first result
- String result = (String) sb.get(shortLineIndex);
- // remove all hashes that appear in the result
- hs = hashSentence(result);
+ String result = computeTextSnippet((String) sb.get(shortLineIndex), queryhashes, minLength, maxLength);
+ Set remaininghashes = removeAppearanceHashes(result, queryhashes);
+
+ if (remaininghashes.size() == 0) return result;
+ // the result has not all words in it.
+ // find another sentence that represents the missing other words
+ // and find recursively more sentences
+ maxLength = maxLength - result.length();
+ if (maxLength < 20) maxLength = 20;
+ String nextSnippet = computeTextSnippet(sentences, remaininghashes, minLength, maxLength);
+ if (nextSnippet == null) return null;
+ return result + (" / " + nextSnippet);
+ } catch (IndexOutOfBoundsException e) {
+ log.logSevere("computeSnippet: error with string generation", e);
+ return "";
+ }
+ }
+
+ private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) {
+ try {
+ if (sentence == null) return null;
+ if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
+ Iterator j;
+ HashMap hs;
+ String hash;
+
+ // find all hashes that appear in the sentence
+ hs = hashSentence(sentence);
j = queryhashes.iterator();
Integer pos;
- Set remaininghashes = new HashSet();
- int p, minpos = result.length(), maxpos = -1;
+ int p, minpos = sentence.length(), maxpos = -1;
while (j.hasNext()) {
hash = (String) j.next();
pos = (Integer) hs.get(hash);
- if (pos == null) {
- remaininghashes.add(new String(hash));
- } else {
+ if (pos != null) {
p = pos.intValue();
if (p > maxpos) maxpos = p;
if (p < minpos) minpos = p;
@@ -425,51 +485,62 @@ public class plasmaSnippetCache {
}
// check result size
maxpos = maxpos + 10;
- if (maxpos > result.length()) maxpos = result.length();
+ if (maxpos > sentence.length()) maxpos = sentence.length();
if (minpos < 0) minpos = 0;
// we have a result, but is it short enough?
if (maxpos - minpos + 10 > maxLength) {
// the string is too long, even if we cut at both ends
// so cut here in the middle of the string
- int lenb = result.length();
- result = result.substring(0, (minpos + 20 > result.length()) ? result.length() : minpos + 20).trim() +
+ int lenb = sentence.length();
+ sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
" [..] " +
- result.substring((maxpos + 26 > result.length()) ? result.length() : maxpos + 26).trim();
- maxpos = maxpos + lenb - result.length() + 6;
+ sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
+ maxpos = maxpos + lenb - sentence.length() + 6;
}
if (maxpos > maxLength) {
// the string is too long, even if we cut it at the end
// so cut it here at both ends at once
int newlen = maxpos - minpos + 10;
int around = (maxLength - newlen) / 2;
- result = "[..] " + result.substring(minpos - around, ((maxpos + around) > result.length()) ? result.length() : (maxpos + around)).trim() + " [..]";
+ sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
minpos = around;
- maxpos = result.length() - around - 5;
+ maxpos = sentence.length() - around - 5;
}
- if (result.length() > maxLength) {
- // trim result, 1st step (cut at right side)
- result = result.substring(0, maxpos).trim() + " [..]";
+ if (sentence.length() > maxLength) {
+ // trim sentence, 1st step (cut at right side)
+ sentence = sentence.substring(0, maxpos).trim() + " [..]";
}
- if (result.length() > maxLength) {
- // trim result, 2nd step (cut at left side)
- result = "[..] " + result.substring(minpos).trim();
+ if (sentence.length() > maxLength) {
+ // trim sentence, 2nd step (cut at left side)
+ sentence = "[..] " + sentence.substring(minpos).trim();
}
- if (result.length() > maxLength) {
- // trim result, 3rd step (cut in the middle)
- result = result.substring(6, 20).trim() + " [..] " + result.substring(result.length() - 26, result.length() - 6).trim();
+ if (sentence.length() > maxLength) {
+ // trim sentence, 3rd step (cut in the middle)
+ sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
}
- if (queryhashes.size() == 0) return result;
- // the result has not all words in it.
- // find another sentence that represents the missing other words
- // and find recursively more sentences
- maxLength = maxLength - result.length();
- if (maxLength < 20) maxLength = 20;
- String nextSnippet = computeSnippet(sentences, remaininghashes, minLength, maxLength);
- return result + ((nextSnippet == null) ? "" : (" / " + nextSnippet));
+ return sentence;
} catch (IndexOutOfBoundsException e) {
log.logSevere("computeSnippet: error with string generation", e);
- return "";
+ return null;
+ }
+ }
+
+ private Set removeAppearanceHashes(String sentence, Set queryhashes) {
+ // remove all hashes that appear in the sentence
+ if (sentence == null) return queryhashes;
+ HashMap hs = hashSentence(sentence);
+ Iterator j = queryhashes.iterator();
+ String hash;
+ Integer pos;
+ Set remaininghashes = new HashSet();
+ while (j.hasNext()) {
+ hash = (String) j.next();
+ pos = (Integer) hs.get(hash);
+ if (pos == null) {
+ remaininghashes.add(new String(hash));
+ }
}
+ return remaininghashes;
}
private HashMap hashSentence(String sentence) {
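[Illustration, not part of the patch: removeAppearanceHashes drives the recursive snippet search; each round keeps only the query hashes that the candidate sentence does not cover. A sketch, assuming hashSentence hashes words with plasmaCondenser.word2hash.]

    Set q = new HashSet();
    q.add(plasmaCondenser.word2hash("yacy"));
    q.add(plasmaCondenser.word2hash("snippet"));
    Set rest = removeAppearanceHashes("this sentence mentions yacy only", q);
    // rest now holds only the hash of "snippet"; once rest is empty, the
    // assembled snippet covers all query words and the recursion stops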
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java
index fa1c008bc..e42a43244 100644
--- a/source/de/anomic/plasma/plasmaSwitchboard.java
+++ b/source/de/anomic/plasma/plasmaSwitchboard.java
@@ -1576,7 +1576,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
- plasmaCondenser condenser = new plasmaCondenser(document);
+ plasmaCondenser condenser = new plasmaCondenser(document, true);
// generate citation reference
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
@@ -1586,6 +1586,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
checkInterruption();
// create a new loaded URL db entry
+ long ldate = System.currentTimeMillis();
indexURLEntry newEntry = wordIndex.loadedURL.newEntry(
entry.url(), // URL
docDescription, // document description
@@ -1594,7 +1595,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
"", // ETag
docDate, // modification date
new Date(), // loaded date
- new Date(), // freshdate
+ new Date(ldate + Math.max(0, ldate - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula
referrerUrlHash, // referer hash
new byte[0], // md5
(int) entry.size(), // size
@@ -1655,16 +1656,16 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
* STORE PAGE INDEX INTO WORD INDEX DB
* ======================================================================== */
words = wordIndex.addPageIndex(
- entry.url(), // document url
- urlHash, // document url hash
- docDate, // document mod date
- (int) entry.size(), // document size
- document, // document content
- condenser, // document condenser
+ entry.url(), // document url
+ urlHash, // document url hash
+ docDate, // document mod date
+ (int) entry.size(), // document size
+ document, // document content
+ condenser, // document condenser
plasmaURL.language(entry.url()), // document language
plasmaURL.docType(document.getMimeType()), // document type
- ioLinks[0].intValue(), // outlinkSame
- ioLinks[1].intValue() // outlinkOthers
+ ioLinks[0].intValue(), // outlinkSame
+ ioLinks[1].intValue() // outlinkOthers
);
} else {
/* ========================================================================
@@ -1704,7 +1705,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
newEntry.size(),
docDate.getTime(),
System.currentTimeMillis(),
- condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
ioLinks[0].intValue(),
@@ -1749,7 +1749,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
}
tmpContainers = null;
- }
+ } //end: SEND PAGE INDEX TO STORAGE PEER
+
storageEndTime = System.currentTimeMillis();
//increment number of indexed urls
@@ -2253,7 +2254,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// get the word set
Set words = null;
try {
- words = new plasmaCondenser(document).words().keySet();
+ words = new plasmaCondenser(document, true).words().keySet();
} catch (UnsupportedEncodingException e) {
e.printStackTrace();
}
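[Illustration, not part of the patch: the new freshdate follows the proxy heuristic of treating a document as fresh for a fraction of its age at load time. Worked through in isolation:]

    long ldate = System.currentTimeMillis();           // load time: now
    long age = Math.max(0, ldate - docDate.getTime()); // age of the document at load time
    Date freshdate = new Date(ldate + age / 2);        // stays fresh for half its age
    // e.g. a page last modified 10 days ago is considered fresh for the next
    // 5 days; a docDate in the future yields age 0, i.e. freshdate == ldate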
diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java
index e126c51fe..0eaf94f26 100644
--- a/source/de/anomic/plasma/plasmaWordIndex.java
+++ b/source/de/anomic/plasma/plasmaWordIndex.java
@@ -251,22 +251,21 @@ public final class plasmaWordIndex implements indexRI {
    // this is called by the switchboard to put a new page into the index
    // use all the words in one condenser object to create index entries simultaneously
- // iterate over all words
+ int wordCount = 0;
+ int urlLength = url.toString().length();
+ int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
+
+ // iterate over all words of context text
Iterator i = condenser.words().entrySet().iterator();
Map.Entry wentry;
String word;
indexRWIEntry ientry;
plasmaCondenser.wordStatProp wprop;
- String wordHash;
- int urlLength = url.toString().length();
- int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
-
while (i.hasNext()) {
wentry = (Map.Entry) i.next();
word = (String) wentry.getKey();
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
- // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
- wordHash = plasmaCondenser.word2hash(word);
+ assert (wprop.flags != null);
ientry = new indexRWIEntryNew(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
wprop.count,
@@ -279,16 +278,15 @@ public final class plasmaWordIndex implements indexRI {
size,
urlModified.getTime(),
System.currentTimeMillis(),
- condenser.RESULT_WORD_ENTROPHY,
language,
doctype,
outlinksSame, outlinksOther,
- condenser.RESULT_FLAGS);
- addEntry(wordHash, ientry, System.currentTimeMillis(), false);
+ wprop.flags);
+ addEntry(plasmaCondenser.word2hash(word), ientry, System.currentTimeMillis(), false);
+ wordCount++;
}
- // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
- // condenser.getWords().size() + " words, flushed " + c + " entries");
- return condenser.RESULT_SIMI_WORDS;
+
+ return wordCount;
}
public indexContainer getContainer(String wordHash, Set urlselection, long maxTime) {
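[Illustration, not part of the patch: the indexing loop now stores per-word flags (wprop.flags, filled in by plasmaCondenser) instead of the document-global condenser.RESULT_FLAGS, so a media flag marks only the words that actually appeared in that media context. Sketch, assuming kelondroBitfield exposes get(int) and the flag_cat_* constants are public.]

    plasmaCondenser.wordStatProp wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
    assert wprop.flags != null; // guaranteed by the back-fill loop in the condenser
    boolean fromImage = wprop.flags.get(plasmaCondenser.flag_cat_hasimage);
    // a word found only in running text gets the plain pre-media template flags,
    // while e.g. a word from an image description additionally carries flag_cat_hasimage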