- added correct flagging of word properties

- added self-healing to the database in case wrong free-pointers exist
- added presentation of media links in snippets (does not yet work correctly)
- code cleanup

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3055 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 10d888e70c
commit bf0d820659

@@ -300,7 +300,7 @@ public class IndexControl_p {
                     "true".equalsIgnoreCase(gzipBody),
                     timeout);
             result = (String) resultObj.get("result");
-            prop.put("result", (result == null) ? ("Successfully transferred " + index.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds") : result);
+            prop.put("result", (result == null) ? ("Successfully transferred " + knownURLs.size() + " words in " + ((System.currentTimeMillis() - starttime) / 1000) + " seconds, " + unknownURLEntries + " URL not found") : result);
             index = null;
         }

@@ -203,7 +203,7 @@ public final class transferRWI {
         }
         if (unknownURLs.length() > 0) { unknownURLs.delete(0, 1); }
         if ((wordhashes.length == 0) || (received == 0)) {
-            sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs");
+            sb.getLog().logInfo("Received 0 RWIs from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + " URLs, blocked " + blocked + " RWIs");
         } else {
             final double avdist = (yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[0]) + yacyDHTAction.dhtDistance(yacyCore.seedDB.mySeed.hash, wordhashes[received - 1])) / 2.0;
             sb.getLog().logInfo("Received " + received + " Entries " + wordc + " Words [" + wordhashes[0] + " .. " + wordhashes[received - 1] + "]/" + avdist + " from " + otherPeerName + ", processed in " + (System.currentTimeMillis() - startProcess) + " milliseconds, requesting " + unknownURL.size() + "/" + receivedURL + " URLs, blocked " + blocked + " RWIs");

@@ -46,10 +46,12 @@
 // javac -classpath .:../classes transferRWI.java

 import java.io.IOException;
+import java.text.ParseException;

 import de.anomic.http.httpHeader;
 import de.anomic.index.indexURLEntry;
 import de.anomic.plasma.plasmaSwitchboard;
+import de.anomic.plasma.plasmaURL;
 import de.anomic.plasma.urlPattern.plasmaURLPattern;
 import de.anomic.server.serverCore;
 import de.anomic.server.serverObjects;
@@ -59,10 +61,13 @@ import de.anomic.yacy.yacySeed;
 public final class transferURL {

     public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch ss) throws InterruptedException {
         if (post == null || ss == null) { return null; }

         long start = System.currentTimeMillis();
+        long freshdate = 0;
+        try {freshdate = plasmaURL.shortDayFormatter.parse("20061101").getTime();} catch (ParseException e1) {}

         // return variable that accumulates replacements
         final plasmaSwitchboard sb = (plasmaSwitchboard) ss;
@@ -93,26 +98,39 @@ public final class transferURL {
             indexURLEntry lEntry;
             for (int i = 0; i < urlc; i++) {
                 serverCore.checkInterruption();

+                // read new lurl-entry
                 urls = (String) post.get("url" + i);
                 if (urls == null) {
                     yacyCore.log.logFine("transferURL: got null URL-string from peer " + otherPeerName);
-                } else {
+                    continue;
+                }

+                // parse new lurl-entry
                 lEntry = sb.wordIndex.loadedURL.newEntry(urls);
                 if (lEntry == null) {
                     yacyCore.log.logWarning("transferURL: received invalid URL (entry null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
-                    // TODO: should we send back an error message???
-                } else {
+                    continue;
+                }

+                // check if entry is well-formed
                 indexURLEntry.Components comp = lEntry.comp();
-                if (comp.url() == null) {
-                    yacyCore.log.logWarning("transferURL: received invalid URL (url null) from peer " + otherPeerName + "\n\tURL Property: " + urls);
-                    // TODO: should we send back an error message???
-                } else {
+                if ((comp.url() == null) || (lEntry.freshdate().getTime() <= freshdate)) {
+                    yacyCore.log.logWarning("transferURL: received invalid URL from peer " + otherPeerName + "\n\tURL Property: " + urls);
+                    continue;
+                }

+                // check if the entry is blacklisted
                 if ((blockBlacklist) && (plasmaSwitchboard.urlBlacklist.isListed(plasmaURLPattern.BLACKLIST_DHT, lEntry.hash(), comp.url()))) {
                     int deleted = sb.wordIndex.tryRemoveURLs(lEntry.hash());
                     yacyCore.log.logFine("transferURL: blocked blacklisted URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName + "; deleted " + deleted + " URL entries from RWIs");
                     lEntry = null;
                     blocked++;
-                } else try {
+                    continue;
+                }

+                // write entry to database
+                try {
                     sb.wordIndex.loadedURL.store(lEntry);
                     sb.wordIndex.loadedURL.stack(lEntry, iam, iam, 3);
                     yacyCore.log.logFine("transferURL: received URL '" + comp.url().toNormalform() + "' from peer " + otherPeerName);
@@ -121,9 +139,6 @@ public final class transferURL {
                     e.printStackTrace();
                 }
             }
-                }
-            }
-        }
             yacyCore.seedDB.mySeed.incRU(received);
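Note on the new freshdate guard above: a minimal standalone sketch of the cutoff check, using plain JDK classes only; the "yyyyMMdd" pattern is an assumption about plasmaURL.shortDayFormatter, which is not shown in this diff.

    import java.text.ParseException;
    import java.text.SimpleDateFormat;
    import java.util.Date;

    public class FreshdateCutoffDemo {
        public static void main(String[] args) throws ParseException {
            // assumed to match the pattern of plasmaURL.shortDayFormatter
            SimpleDateFormat shortDayFormatter = new SimpleDateFormat("yyyyMMdd");
            long cutoff = shortDayFormatter.parse("20061101").getTime();
            Date freshdate = new Date(); // stands in for lEntry.freshdate()
            boolean rejected = freshdate.getTime() <= cutoff;
            System.out.println("entry rejected: " + rejected); // false for a current date
        }
    }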

@@ -117,7 +117,6 @@ public class indexRWIEntryNew implements Cloneable, indexRWIEntry {
             int sizeOfPage,    // # of bytes of the page TODO: not needed any more
             long lastmodified, // last-modified time of the document where word appears
             long updatetime,   // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
-            int quality,       // the entropy value
             String language,   // (guessed) language of document
             char doctype,      // type of document
             int outlinksSame,  // outlinks to same domain

@@ -33,7 +33,6 @@ import de.anomic.kelondro.kelondroColumn;
 import de.anomic.kelondro.kelondroRow;
 import de.anomic.kelondro.kelondroRow.Entry;
 import de.anomic.plasma.plasmaSearchQuery;
-import de.anomic.plasma.plasmaURL;
 import de.anomic.plasma.plasmaWordIndex;
 import de.anomic.yacy.yacySeedDB;
@@ -66,7 +65,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
     private static final int col_hitcount = 3;
     private static final int col_language = 4;
     private static final int col_doctype = 5;
-    private static final int col_localflag = 6;
+    //private static final int col_localflag = 6;
     private static final int col_posintext = 7;
     private static final int col_posinphrase = 8;
     private static final int col_posofphrase = 9;
@@ -77,6 +76,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
     private kelondroRow.Entry entry;

+    /*
     public indexRWIEntryOld(String urlHash,
             int urlLength,     // byte-length of complete URL
             int urlComps,      // number of path components
@@ -91,7 +91,6 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
             int sizeOfPage,    // # of bytes of the page
             long lastmodified, //*last-modified time of the document where word appears
             long updatetime,   // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
-            int quality,       //*the entropy value
             String language,   //*(guessed) language of document
             char doctype,      //*type of document
             int outlinksSame,  // outlinks to same domain
@@ -107,7 +106,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
         if ((language == null) || (language.length() != urlEntryRow.width(col_language))) language = "uk";
         this.entry = urlEntryRow.newEntry();
         this.entry.setCol(col_urlhash, urlHash, null);
-        this.entry.setCol(col_quality, quality);
+        this.entry.setCol(col_quality, 0);
         this.entry.setCol(col_lastModified, lastmodified);
         this.entry.setCol(col_hitcount, hitcount);
         this.entry.setCol(col_language, language, null);
@@ -121,7 +120,7 @@ public class indexRWIEntryOld implements Cloneable, indexRWIEntry {
         this.entry.setCol(col_phrasecount, phrasecount);
         //System.out.println("DEBUG-NEWENTRY " + toPropertyForm());
     }
+    */

     public indexRWIEntryOld(String urlHash, String code) {
         // the code is the external form of the row minus the leading urlHash entry
         this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());

@@ -24,7 +24,7 @@
 package de.anomic.kelondro;

-public class kelondroBitfield {
+public class kelondroBitfield implements Cloneable {

     // the bitfield implements a binary array. Such arrays may be exported in a base64-String
@@ -55,6 +55,12 @@ public class kelondroBitfield {
         }
     }

+    public Object clone() {
+        kelondroBitfield theClone = new kelondroBitfield(new byte[this.bb.length]);
+        System.arraycopy(this.bb, 0, theClone.bb, 0, this.bb.length);
+        return theClone;
+    }
+
     public void set(int pos, boolean value) {
         assert (pos >= 0);
         int slot = pos / 8;
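The new clone() exists so that flag templates can be copied per word without sharing the underlying byte buffer. A standalone sketch of that deep-copy semantics, using plain arrays rather than the kelondro API:

    public class BitfieldCloneDemo {
        public static void main(String[] args) {
            byte[] template = new byte[4];           // stands in for kelondroBitfield.bb
            byte[] copy = new byte[template.length]; // clone(): fresh buffer ...
            System.arraycopy(template, 0, copy, 0, template.length); // ... plus arraycopy
            copy[0] |= 1;                            // setting a bit in the copy
            System.out.println(template[0] + " / " + copy[0]); // "0 / 1": template untouched
        }
    }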

@@ -1392,7 +1392,7 @@ public class kelondroRecords {
                 USAGE.FREEC--;
                 // take link
                 if (USAGE.FREEH.index == NUL) {
-                    System.out.println("INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records. Affected file: " + filename);
+                    serverLog.logSevere("kelondroRecords/" + filename, "INTERNAL ERROR (DATA INCONSISTENCY): re-use of records failed, lost " + (USAGE.FREEC + 1) + " records.");
                     // try to heal..
                     USAGE.USEDC = USAGE.allCount() + 1;
                     USAGE.FREEC = 0;
@@ -1402,11 +1402,18 @@ public class kelondroRecords {
                     //System.out.println("*DEBUG* ALLOCATED DELETED INDEX " + index);
                     // check for valid seek position
                     long seekp = seekpos(USAGE.FREEH);
-                    if (seekp > entryFile.length()) throw new kelondroException("new Handle: seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
+                    if (seekp > entryFile.length()) {
+                        // this is a severe inconsistency. try to heal..
+                        serverLog.logSevere("kelondroRecords/" + filename, "new Handle: lost " + USAGE.FREEC + " marked nodes; seek position " + seekp + "/" + USAGE.FREEH.index + " out of file size " + entryFile.length() + "/" + ((entryFile.length() - POS_NODES) / recordsize));
+                        index = USAGE.allCount();   // a place at the end of the file
+                        USAGE.USEDC += USAGE.FREEC; // to avoid that non-empty records at the end are overwritten
+                        USAGE.FREEC = 0;            // discard all possible empty nodes
+                        USAGE.FREEH.index = NUL;
+                    } else {
                         // read link to next element of FREEH chain
                         USAGE.FREEH.index = entryFile.readInt(seekp);
+                    }
                 }
                 USAGE.write();
             }
         }
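The healing strategy above: when the head of the free-record chain points past the end of the file, the whole chain is discarded and allocation falls back to appending, so no live record can be overwritten. A simplified sketch of the same idea with hypothetical names, an int array standing in for the record file:

    public class FreeListHealDemo {
        int[] next = new int[10]; // next[i]: following slot in the free chain; -1 ends it
        int freeHead = 42;        // corrupted head: points outside the "file"
        int used = 10;

        int allocate() {
            if (freeHead < 0 || freeHead >= next.length) {
                freeHead = -1;    // heal: discard the (possibly corrupt) free chain
                return used++;    // allocate at the end; nothing gets overwritten
            }
            int slot = freeHead;  // normal case: re-use a freed slot
            freeHead = next[slot];
            return slot;
        }

        public static void main(String[] args) {
            System.out.println(new FreeListHealDemo().allocate()); // 10: healed, not crashed
        }
    }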

@@ -49,7 +49,6 @@ import java.io.ByteArrayInputStream;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileNotFoundException;
-import java.io.FileWriter;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
@@ -66,6 +65,8 @@ import java.util.TreeMap;
 import java.util.TreeSet;

 import de.anomic.htmlFilter.htmlFilterContentScraper;
+import de.anomic.htmlFilter.htmlFilterImageEntry;
+import de.anomic.index.indexRWIEntryNew;
 import de.anomic.kelondro.kelondroBase64Order;
 import de.anomic.kelondro.kelondroBitfield;
 import de.anomic.kelondro.kelondroMSetTools;
@@ -114,18 +115,121 @@ public final class plasmaCondenser {
     public int RESULT_NUMB_WORDS = -1;
     public int RESULT_DIFF_WORDS = -1;
     public int RESULT_SIMI_WORDS = -1;
-    public int RESULT_WORD_ENTROPHY = -1;
     public int RESULT_NUMB_SENTENCES = -1;
     public int RESULT_DIFF_SENTENCES = -1;
     public int RESULT_SIMI_SENTENCES = -1;
     public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);

-    public plasmaCondenser(plasmaParserDocument document) throws UnsupportedEncodingException {
+    public plasmaCondenser(plasmaParserDocument document, boolean addMedia) throws UnsupportedEncodingException {
+        // if addMedia == true, then all the media links are also parsed and added to the words
+        // added media words are flagged with the appropriate media flag
         this(document.getText(), document.getCharset());

+        kelondroBitfield wflags = (kelondroBitfield) RESULT_FLAGS.clone(); // the template for the word flags, only from position 0..19
+
+        // construct flag set for document
         if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
         if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
         if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
         if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
+
+        // the phrase counter:
+        // phrase 0 are words taken from the URL
+        // phrase 1 is the MainLongTitle
+        // phrase 2 is the MainShortTitle
+        // phrase 3 is the Document Abstract
+        // phrase 4 is the Document Author
+        // phrase 5 are the tags specified in document
+        // phrase 10 and above are the section headlines/titles (88 possible)
+        // phrase 98 is taken from the embedded anchor/hyperlinks description
+        // phrase 99 is taken from the media Link url and anchor description
+        // phrase 100 and above are lines from the text
+        insertTextToWords(document.getMainLongTitle(), 1, indexRWIEntryNew.flag_app_descr, wflags);
+        insertTextToWords(document.getMainShortTitle(), 2, indexRWIEntryNew.flag_app_descr, wflags);
+        insertTextToWords(document.getAbstract(), 3, indexRWIEntryNew.flag_app_descr, wflags);
+        // missing: author!
+        // missing: tags!
+        String[] titles = document.getSectionTitles();
+        for (int i = 0; i < titles.length; i++) {
+            insertTextToWords(titles[i], i + 10, indexRWIEntryNew.flag_app_emphasized, wflags);
+        }
+
+        // anchors
+        Iterator i = document.getAnchors().entrySet().iterator();
+        Map.Entry entry;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 98, indexRWIEntryNew.flag_app_url, wflags);
+            insertTextToWords((String) entry.getValue(), 98, indexRWIEntryNew.flag_app_url, wflags);
+        }
+
+        // audio
+        i = document.getAudiolinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasaudio, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, wflags);
+        }
+
+        // video
+        i = document.getVideolinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasvideo, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, wflags);
+        }
+
+        // applications
+        i = document.getApplinks().entrySet().iterator();
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            insertTextToWords((String) entry.getKey(), 99, flag_cat_hasapp, wflags);
+            insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, wflags);
+        }
+
+        // images
+        i = document.getImages().iterator();
+        htmlFilterImageEntry ientry;
+        while (i.hasNext()) {
+            ientry = (htmlFilterImageEntry) i.next();
+            insertTextToWords((String) ientry.url().toNormalform(), 99, flag_cat_hasimage, wflags);
+            insertTextToWords((String) ientry.alt(), 99, flag_cat_hasimage, wflags);
+        }
+
+        // finally check all words for missing flag entry
+        i = words.entrySet().iterator();
+        wordStatProp wprop;
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            wprop = (wordStatProp) entry.getValue();
+            if (wprop.flags == null) {
+                wprop.flags = (kelondroBitfield) wflags.clone();
+                words.put(entry.getKey(), wprop);
+            }
+        }
+    }
+
+    private void insertTextToWords(String text, int phrase, int flagpos, kelondroBitfield flagstemplate) {
+        String word;
+        wordStatProp wprop;
+        sievedWordsEnum wordenum;
+        try {
+            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8", 3);
+        } catch (UnsupportedEncodingException e) {
+            return;
+        }
+        int pip = 0;
+        while (wordenum.hasMoreElements()) {
+            word = ((String) wordenum.nextElement()).toLowerCase();
+            wprop = (wordStatProp) words.get(word);
+            if (wprop == null) wprop = new wordStatProp(0, pip, phrase);
+            if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone();
+            wprop.numOfPhrase = 1;
+            wprop.posInPhrase = pip;
+            wprop.flags.set(flagpos, true);
+            words.put(word, wprop);
+            pip++;
+        }
     }

     public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
@@ -174,7 +278,7 @@ public final class plasmaCondenser {
     }

     public Map words() {
-        // returns the words as wod/wordStatProp relation map
+        // returns the words as word/wordStatProp relation map
         return words;
     }

@@ -183,9 +287,10 @@ public final class plasmaCondenser {
         public int count;       // number of occurrences
         public int posInText;   // unique handle, is initialized with word position (excluding double occurring words)
-        public int posInPhrase; //
-        public int numOfPhrase;
-        public HashSet hash;    //
+        public int posInPhrase; // position of word in phrase
+        public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
+        public HashSet hash;    // a set of handles to all sentences where this word appears
+        public kelondroBitfield flags; // the flag bits for each word

         public wordStatProp(int handle, int pip, int nop) {
             this.count = 1;
@@ -193,6 +298,7 @@ public final class plasmaCondenser {
             this.posInPhrase = pip;
             this.numOfPhrase = nop;
             this.hash = new HashSet();
+            this.flags = null;
         }

         public void inc() {
@@ -314,7 +420,7 @@ public final class plasmaCondenser {
         } else {
             // word does not yet exist, create new word entry
             wordHandle = wordHandleCount++;
-            wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 1);
+            wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 100);
         }
         words.put(word, wsp);
         // we now have the unique handle of the word, put it into the sentence:
@@ -429,7 +535,6 @@ public final class plasmaCondenser {
         this.RESULT_NUMB_WORDS = allwordcounter;
         this.RESULT_DIFF_WORDS = wordHandleCount;
         this.RESULT_SIMI_WORDS = words.size();
-        this.RESULT_WORD_ENTROPHY = (allwordcounter == 0) ? 0 : (255 * words.size() / allwordcounter);
         this.RESULT_NUMB_SENTENCES = allsentencecounter;
         this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
         this.RESULT_SIMI_SENTENCES = sentences.size();
@@ -508,6 +613,7 @@ public final class plasmaCondenser {
         return orderedSentences;
     }

+    /*
     public void writeMapToFile(File out) throws IOException {
         Map.Entry entry;
         String k;
@@ -520,7 +626,7 @@ public final class plasmaCondenser {
         // we reconstruct the word hashtable
         // and sort the entries by the number of occurrences
         // this structure is needed to print out a sorted list of words
-        TreeMap sortedWords = new TreeMap(/*kelondroNaturalOrder.naturalOrder*/);
+        TreeMap sortedWords = new TreeMap(); //kelondroNaturalOrder.naturalOrder
         it = words.entrySet().iterator(); // enumerates the keys in ascending order
         while (it.hasNext()) {
             entry = (Map.Entry) it.next();
@@ -549,7 +655,7 @@ public final class plasmaCondenser {
         }
         writer.close();
     }
+    */

     public final static boolean invisible(char c) {
         // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
         if ((c < ' ') || (c > 'z')) return true;
@@ -771,16 +877,22 @@ public final class plasmaCondenser {
     }

-    public static Map getWords(InputStream input, String charset) throws UnsupportedEncodingException {
-        if (input == null) return null;
-        plasmaCondenser condenser = new plasmaCondenser(input, charset);
-        return condenser.words;
-    }
-
     public static Map getWords(byte[] text, String charset) throws UnsupportedEncodingException {
+        // returns a word/wordStatProp relation map
         if (text == null) return null;
         ByteArrayInputStream buffer = new ByteArrayInputStream(text);
-        return getWords(buffer, charset);
+        return new plasmaCondenser(buffer, charset, 2, 1).words();
+    }
+
+    public static Map getWords(String text) {
+        // returns a word/wordStatProp relation map
+        if (text == null) return null;
+        ByteArrayInputStream buffer = new ByteArrayInputStream(text.getBytes());
+        try {
+            return new plasmaCondenser(buffer, "UTF-8", 2, 1).words();
+        } catch (UnsupportedEncodingException e) {
+            return null;
+        }
     }

     public static void main(String[] args) {
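How the word flagging fits together: every word gets a bitfield cloned from a document-level template, and words harvested from media links additionally get their category bit set. A standalone sketch with java.util.BitSet in place of kelondroBitfield; the bit position is hypothetical:

    import java.util.BitSet;
    import java.util.HashMap;
    import java.util.Map;

    public class WordFlagsDemo {
        static final int FLAG_CAT_HASAUDIO = 5; // hypothetical bit position

        public static void main(String[] args) {
            BitSet template = new BitSet(32); // document-level template, like wflags
            Map<String, BitSet> words = new HashMap<String, BitSet>();
            for (String w : "free music download".split(" ")) {
                BitSet flags = (BitSet) template.clone(); // clone per word, never share
                flags.set(FLAG_CAT_HASAUDIO);             // word came from an audio link
                words.put(w, flags);
            }
            System.out.println(words.get("music").get(FLAG_CAT_HASAUDIO)); // true
        }
    }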

@@ -220,6 +220,7 @@ public class plasmaParserDocument {

     public Map getAnchors() {
         // returns all links embedded as anchors (clickeable entities)
+        // this is a url(String)/text(String) map
         return anchors;
     }

@@ -52,6 +52,7 @@ import java.util.Enumeration;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
+import java.util.Map;
 import java.util.Set;

 import de.anomic.http.httpHeader;
@@ -256,26 +257,37 @@ public class plasmaSnippetCache {
         }
         if (document == null) return new Snippet(null, ERROR_PARSER_FAILED, "parser error/failed"); // cannot be parsed
-        //System.out.println("loaded document for URL " + url);
-        final Enumeration sentences = document.getSentences(pre);
-        document.close();
-        //System.out.println("----" + url.toString()); for (int l = 0; l < sentences.length; l++) System.out.println(sentences[l]);
-        if (sentences == null) {
-            //System.out.println("found no sentences in url " + url);
-            return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
-        }

         /* ===========================================================================
          * COMPUTE SNIPPET
          * =========================================================================== */
         // we have found a parseable non-empty file: use the lines
-        line = computeSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
-        //System.out.println("loaded snippet for URL " + url + ": " + line);
+
+        // compute snippet from text
+        final Enumeration sentences = document.getSentences(pre);
+        if (sentences == null) return new Snippet(null, ERROR_PARSER_NO_LINES, "parser returned no sentences");
+        String textline = computeTextSnippet(sentences, queryhashes, 3 * queryhashes.size(), snippetMaxLength);
+
+        // compute snippet from media
+        String audioline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+        String videoline = computeMediaSnippet(document.getVideolinks(), queryhashes);
+        String appline = computeMediaSnippet(document.getApplinks(), queryhashes);
+        //String hrefline = computeMediaSnippet(document.getAnchors(), queryhashes);
+        //String imageline = computeMediaSnippet(document.getAudiolinks(), queryhashes);
+
+        line = "";
+        if (audioline != null) line += (line.length() == 0) ? audioline : "<br />" + audioline;
+        if (videoline != null) line += (line.length() == 0) ? videoline : "<br />" + videoline;
+        if (appline != null) line += (line.length() == 0) ? appline : "<br />" + appline;
+        //if (hrefline != null) line += (line.length() == 0) ? hrefline : "<br />" + hrefline;
+        if (textline != null) line += (line.length() == 0) ? textline : "<br />" + textline;
+
         if (line == null) return new Snippet(null, ERROR_NO_MATCH, "no matching snippet found");
         if (line.length() > snippetMaxLength) line = line.substring(0, snippetMaxLength);

         // finally store this snippet in our own cache
         storeToCache(wordhashes, urlhash, line);
+
+        document.close();
         return new Snippet(line, source, null);
     }

@@ -366,7 +378,32 @@ public class plasmaSnippetCache {
         return (String) snippetsCache.get(key);
     }

-    private String computeSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
+    private String computeMediaSnippet(Map media, Set queryhashes) {
+        Iterator i = media.entrySet().iterator();
+        Map.Entry entry;
+        String url, desc;
+        Set s;
+        String result = "";
+        while (i.hasNext()) {
+            entry = (Map.Entry) i.next();
+            url = (String) entry.getKey();
+            desc = (String) entry.getValue();
+            s = removeAppearanceHashes(url, queryhashes);
+            if (s.size() == 0) {
+                result += "<br /><a href=\"" + url + "\">" + ((desc.length() == 0) ? url : desc) + "</a>";
+                continue;
+            }
+            s = removeAppearanceHashes(desc, s);
+            if (s.size() == 0) {
+                result += "<br /><a href=\"" + url + "\">" + ((desc.length() == 0) ? url : desc) + "</a>";
+                continue;
+            }
+        }
+        if (result.length() == 0) return null;
+        return result.substring(6);
+    }
+
+    private String computeTextSnippet(Enumeration sentences, Set queryhashes, int minLength, int maxLength) {
         try {
             if (sentences == null) return null;
             if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
@@ -404,20 +441,43 @@ public class plasmaSnippetCache {
                     shortLineLength = ((String) sb.get(i)).length();
                 }
             }

             // find a first result
-            String result = (String) sb.get(shortLineIndex);
-            // remove all hashes that appear in the result
-            hs = hashSentence(result);
+            String result = computeTextSnippet((String) sb.get(shortLineIndex), queryhashes, minLength, maxLength);
+            Set remaininghashes = removeAppearanceHashes(result, queryhashes);
+            if (remaininghashes.size() == 0) return result;
+            // the result has not all words in it.
+            // find another sentence that represents the missing other words
+            // and find recursively more sentences
+            maxLength = maxLength - result.length();
+            if (maxLength < 20) maxLength = 20;
+            String nextSnippet = computeTextSnippet(sentences, remaininghashes, minLength, maxLength);
+            if (nextSnippet == null) return null;
+            return result + (" / " + nextSnippet);
+        } catch (IndexOutOfBoundsException e) {
+            log.logSevere("computeSnippet: error with string generation", e);
+            return "";
+        }
+    }
+
+    private String computeTextSnippet(String sentence, Set queryhashes, int minLength, int maxLength) {
+        try {
+            if (sentence == null) return null;
+            if ((queryhashes == null) || (queryhashes.size() == 0)) return null;
+            Iterator j;
+            HashMap hs;
+            String hash;
+
+            // find all hashes that appear in the sentence
+            hs = hashSentence(sentence);
             j = queryhashes.iterator();
             Integer pos;
-            Set remaininghashes = new HashSet();
-            int p, minpos = result.length(), maxpos = -1;
+            int p, minpos = sentence.length(), maxpos = -1;
             while (j.hasNext()) {
                 hash = (String) j.next();
                 pos = (Integer) hs.get(hash);
-                if (pos == null) {
-                    remaininghashes.add(new String(hash));
-                } else {
+                if (pos != null) {
                     p = pos.intValue();
                     if (p > maxpos) maxpos = p;
                     if (p < minpos) minpos = p;
@@ -425,51 +485,62 @@ public class plasmaSnippetCache {
                 }
             }

             // check result size
             maxpos = maxpos + 10;
-            if (maxpos > result.length()) maxpos = result.length();
+            if (maxpos > sentence.length()) maxpos = sentence.length();
             if (minpos < 0) minpos = 0;
             // we have a result, but is it short enough?
             if (maxpos - minpos + 10 > maxLength) {
                 // the string is too long, even if we cut at both ends
                 // so cut here in the middle of the string
-                int lenb = result.length();
-                result = result.substring(0, (minpos + 20 > result.length()) ? result.length() : minpos + 20).trim() +
+                int lenb = sentence.length();
+                sentence = sentence.substring(0, (minpos + 20 > sentence.length()) ? sentence.length() : minpos + 20).trim() +
                 " [..] " +
-                result.substring((maxpos + 26 > result.length()) ? result.length() : maxpos + 26).trim();
-                maxpos = maxpos + lenb - result.length() + 6;
+                sentence.substring((maxpos + 26 > sentence.length()) ? sentence.length() : maxpos + 26).trim();
+                maxpos = maxpos + lenb - sentence.length() + 6;
             }
             if (maxpos > maxLength) {
                 // the string is too long, even if we cut it at the end
                 // so cut it here at both ends at once
                 int newlen = maxpos - minpos + 10;
                 int around = (maxLength - newlen) / 2;
-                result = "[..] " + result.substring(minpos - around, ((maxpos + around) > result.length()) ? result.length() : (maxpos + around)).trim() + " [..]";
+                sentence = "[..] " + sentence.substring(minpos - around, ((maxpos + around) > sentence.length()) ? sentence.length() : (maxpos + around)).trim() + " [..]";
                 minpos = around;
-                maxpos = result.length() - around - 5;
+                maxpos = sentence.length() - around - 5;
             }
-            if (result.length() > maxLength) {
-                // trim result, 1st step (cut at right side)
-                result = result.substring(0, maxpos).trim() + " [..]";
+            if (sentence.length() > maxLength) {
+                // trim sentence, 1st step (cut at right side)
+                sentence = sentence.substring(0, maxpos).trim() + " [..]";
             }
-            if (result.length() > maxLength) {
-                // trim result, 2nd step (cut at left side)
-                result = "[..] " + result.substring(minpos).trim();
+            if (sentence.length() > maxLength) {
+                // trim sentence, 2nd step (cut at left side)
+                sentence = "[..] " + sentence.substring(minpos).trim();
             }
-            if (result.length() > maxLength) {
-                // trim result, 3rd step (cut in the middle)
-                result = result.substring(6, 20).trim() + " [..] " + result.substring(result.length() - 26, result.length() - 6).trim();
+            if (sentence.length() > maxLength) {
+                // trim sentence, 3rd step (cut in the middle)
+                sentence = sentence.substring(6, 20).trim() + " [..] " + sentence.substring(sentence.length() - 26, sentence.length() - 6).trim();
             }
-            if (queryhashes.size() == 0) return result;
-            // the result has not all words in it.
-            // find another sentence that represents the missing other words
-            // and find recursively more sentences
-            maxLength = maxLength - result.length();
-            if (maxLength < 20) maxLength = 20;
-            String nextSnippet = computeSnippet(sentences, remaininghashes, minLength, maxLength);
-            return result + ((nextSnippet == null) ? "" : (" / " + nextSnippet));
+            return sentence;
         } catch (IndexOutOfBoundsException e) {
             log.logSevere("computeSnippet: error with string generation", e);
-            return "";
+            return null;
+        }
+    }
+
+    private Set removeAppearanceHashes(String sentence, Set queryhashes) {
+        // remove all hashes that appear in the sentence
+        if (sentence == null) return queryhashes;
+        HashMap hs = hashSentence(sentence);
+        Iterator j = queryhashes.iterator();
+        String hash;
+        Integer pos;
+        Set remaininghashes = new HashSet();
+        while (j.hasNext()) {
+            hash = (String) j.next();
+            pos = (Integer) hs.get(hash);
+            if (pos == null) {
+                remaininghashes.add(new String(hash));
+            }
         }
+        return remaininghashes;
     }

     private HashMap hashSentence(String sentence) {
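The snippet assembly above joins the media and text parts with "<br />" separators, skipping null parts and adding no separator before the first one. A self-contained sketch of that joining rule:

    public class SnippetJoinDemo {
        public static void main(String[] args) {
            String[] parts = { "<a href=\"http://example.net/a.mp3\">a.mp3</a>", null, "[..] text match [..]" };
            String line = "";
            for (String p : parts) {
                if (p != null) line += (line.length() == 0) ? p : "<br />" + p;
            }
            System.out.println(line); // audio link, then "<br />", then the text snippet
        }
    }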

@@ -1576,7 +1576,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                     checkInterruption();
                     log.logFine("Condensing for '" + entry.normalizedURLString() + "'");
-                    plasmaCondenser condenser = new plasmaCondenser(document);
+                    plasmaCondenser condenser = new plasmaCondenser(document, true);

                     // generate citation reference
                     Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
@@ -1586,6 +1586,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                     checkInterruption();

                     // create a new loaded URL db entry
+                    long ldate = System.currentTimeMillis();
                     indexURLEntry newEntry = wordIndex.loadedURL.newEntry(
                             entry.url(),        // URL
                             docDescription,     // document description
@@ -1594,7 +1595,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                             "",                 // ETag
                             docDate,            // modification date
                             new Date(),         // loaded date
-                            new Date(),         // freshdate
+                            new Date(ldate + Math.max(0, ldate - docDate.getTime()) / 2), // freshdate, computed with Proxy-TTL formula
                             referrerUrlHash,    // referer hash
                             new byte[0],        // md5
                             (int) entry.size(), // size
@@ -1704,7 +1705,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                             newEntry.size(),
                             docDate.getTime(),
                             System.currentTimeMillis(),
-                            condenser.RESULT_WORD_ENTROPHY,
                             language,
                             doctype,
                             ioLinks[0].intValue(),
@@ -1749,7 +1749,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
                         }
                         tmpContainers = null;
-                    }
+                    } //end: SEND PAGE INDEX TO STORAGE PEER
+
                     storageEndTime = System.currentTimeMillis();

                     //increment number of indexed urls
@@ -2253,7 +2254,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
         // get the word set
         Set words = null;
         try {
-            words = new plasmaCondenser(document).words().keySet();
+            words = new plasmaCondenser(document, true).words().keySet();
         } catch (UnsupportedEncodingException e) {
             e.printStackTrace();
         }
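Worked example of the freshdate computation above, the same heuristic a proxy uses for TTLs: a document that was last modified N days before loading is assumed to stay fresh for another N/2 days.

    public class FreshdateFormulaDemo {
        public static void main(String[] args) {
            long day = 24L * 60 * 60 * 1000;
            long ldate = System.currentTimeMillis();       // load time
            long docDate = ldate - 10 * day;               // last modified 10 days ago
            long freshdate = ldate + Math.max(0, ldate - docDate) / 2;
            System.out.println((freshdate - ldate) / day); // 5: fresh for five more days
        }
    }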

@@ -251,22 +251,21 @@ public final class plasmaWordIndex implements indexRI {
         // this is called by the switchboard to put in a new page into the index
         // use all the words in one condenser object to simultaneously create index entries

-        // iterate over all words
+        int wordCount = 0;
+        int urlLength = url.toString().length();
+        int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
+
+        // iterate over all words of context text
         Iterator i = condenser.words().entrySet().iterator();
         Map.Entry wentry;
         String word;
         indexRWIEntry ientry;
         plasmaCondenser.wordStatProp wprop;
-        String wordHash;
-        int urlLength = url.toString().length();
-        int urlComps = htmlFilterContentScraper.urlComps(url.toString()).length;
         while (i.hasNext()) {
             wentry = (Map.Entry) i.next();
             word = (String) wentry.getKey();
             wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
-            // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
-            wordHash = plasmaCondenser.word2hash(word);
+            assert (wprop.flags != null);
             ientry = new indexRWIEntryNew(urlHash,
                     urlLength, urlComps, (document == null) ? urlLength : document.getMainLongTitle().length(),
                     wprop.count,
@@ -279,16 +278,15 @@ public final class plasmaWordIndex implements indexRI {
                     size,
                     urlModified.getTime(),
                     System.currentTimeMillis(),
-                    condenser.RESULT_WORD_ENTROPHY,
                     language,
                     doctype,
                     outlinksSame, outlinksOther,
-                    condenser.RESULT_FLAGS);
-            addEntry(wordHash, ientry, System.currentTimeMillis(), false);
+                    wprop.flags);
+            addEntry(plasmaCondenser.word2hash(word), ientry, System.currentTimeMillis(), false);
+            wordCount++;
         }
-        // System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
-        // condenser.getWords().size() + " words, flushed " + c + " entries");
-        return condenser.RESULT_SIMI_WORDS;
+
+        return wordCount;
     }

     public indexContainer getContainer(String wordHash, Set urlselection, long maxTime) {
