tried to add word position to index

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1377 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent f1cfee7703
commit 0371494010

@ -442,17 +442,20 @@ public class IndexControl_p {
final Iterator en = index.elements(true);
result.append("URL entries related to this word hash <span class=\"tt\">").append(keyhash).append("</span><br><br>");
result.append("<form action=\"IndexControl_p.html\" method=\"post\" enctype=\"multipart/form-data\">");
String us, uh;
String us;
String uh[] = new String[2];
int i = 0;
final TreeMap tm = new TreeMap();
plasmaWordIndexEntry xi;
while (en.hasNext()) {
uh = ((plasmaWordIndexEntry)en.next()).getUrlHash();
xi = (plasmaWordIndexEntry) en.next();
uh = new String[]{xi.getUrlHash(), Integer.toString(xi.posintext())};
try {
us = switchboard.urlPool.loadedURL.getEntry(uh).url().toString();
us = switchboard.urlPool.loadedURL.getEntry(uh[0]).url().toString();
tm.put(us, uh);
} catch (IOException e) {
tm.put(uh, uh);
tm.put(uh[0], uh);
}
}
@ -460,15 +463,15 @@ public class IndexControl_p {
result.ensureCapacity((tm.size() + 2) * 384);
while (iter.hasNext()) {
us = iter.next().toString();
uh = (String)tm.get(us);
result.append("<input type=\"checkbox\" name=\"urlhx").append(i++).append("\" value=\"").append(uh).append("\" align=\"top\">");
if (us.equals(uh)) {
result.append("<span class=\"tt\">").append(uh).append("&nbsp;&lt;unresolved URL Hash&gt;</span><br>");
uh = (String[]) tm.get(us);
result.append("<input type=\"checkbox\" name=\"urlhx").append(i++).append("\" value=\"").append(uh[0]).append("\" align=\"top\">");
if (us.equals(uh[0])) {
result.append("<span class=\"tt\">").append(uh[0]).append("&nbsp;&lt;unresolved URL Hash&gt;</span><br>");
} else {
result.append("<a href=\"/IndexControl_p.html?").append("keystring=").append(keystring)
.append("&keyhash=").append(keyhash).append("&urlhash=").append(uh)
.append("&keyhash=").append(keyhash).append("&urlhash=").append(uh[0])
.append("&urlstringsearch=").append("&urlstring=").append(us).append("\" class=\"tt\">")
.append(uh).append("</a><span class=\"tt\">&nbsp;").append(us).append("</span><br>");
.append(uh[0]).append("</a><span class=\"tt\">&nbsp;").append(us).append(", pos=").append(uh[1]).append("</span><br>");
}
}
result.append("<input type=\"hidden\" name=\"keystring\" value=\"").append(keystring).append("\">")

@ -51,7 +51,7 @@ public class kelondroNaturalOrder extends kelondroAbstractOrder implements kelon
boolean asc;
public static final Comparator naturalOrder = new kelondroNaturalOrder(true);
public static final kelondroOrder naturalOrder = new kelondroNaturalOrder(true);
public kelondroNaturalOrder(boolean ascending) {
this.asc = ascending;
@ -79,6 +79,23 @@ public class kelondroNaturalOrder extends kelondroAbstractOrder implements kelon
return c;
}
public static byte[] encodeLong(long c, int length) {
    // writes the lowest 'length' bytes of c into a fresh array,
    // most significant byte first; higher-order bytes of c that do not fit are dropped
    final byte[] encoded = new byte[length];
    for (int pos = length - 1; pos >= 0; pos--) {
        // take the current lowest byte, then shift the next one into place
        encoded[pos] = (byte) (c & 0xFF);
        c >>= 8;
    }
    return encoded;
}
public static long decodeLong(byte[] s) {
    // reads at most the first 8 bytes of s, most significant byte first,
    // treating every byte as an unsigned value; an empty array yields 0
    final int limit = Math.min(8, s.length);
    long value = 0;
    for (int idx = 0; idx < limit; idx++) {
        value = (value << 8) | (s[idx] & 0xFFL);
    }
    return value;
}
// Compares its two arguments for order.
// Returns -1, 0, or 1 as the first argument
// is less than, equal to, or greater than the second.

@ -58,11 +58,11 @@ public class kelondroSplittedTree implements kelondroIndex {
private static File dbFile(File path, String filenameStub, int forkfactor, int columns, int number) {
String ns = Integer.toHexString(number).toUpperCase();
while (ns.length() < 2) ns = "0" + ns;
String fs = Integer.toHexString(forkfactor).toUpperCase();
while (fs.length() < 2) fs = "0" + fs;
String cs = Integer.toHexString(columns).toUpperCase();
while (cs.length() < 2) cs = "0" + cs;
return new File(path, filenameStub + "_" + ns + "." + fs + cs + ".ktf");
String ff = Integer.toHexString(forkfactor).toUpperCase();
while (ff.length() < 2) ff = "0" + ff;
String co = Integer.toHexString(columns).toUpperCase();
while (co.length() < 2) co = "0" + co;
return new File(path, filenameStub + "." + ff + "." + co + "." + ns + ".ktc");
}
private static boolean existsAll(File pathToFiles, String filenameStub, int forkfactor, int columns){

@ -116,20 +116,70 @@ public final class plasmaCondenser {
public int wordCount(String word) {
// number of occurrences of one word
// if the word did not occur, this simply returns 0
statProp sp = (statProp) words.get(word);
if (sp == null)
return 0;
wordStatProp sp = (wordStatProp) words.get(word);
if (sp == null) return 0;
return sp.count;
}
public static class statProp {
public int count;
public int wordPositionInText(String word) {
    // looks up the statistics entry stored for the word and returns its posInText value
    // a word without an entry yields 0
    final wordStatProp stats = (wordStatProp) words.get(word);
    return (stats == null) ? 0 : stats.posInText;
}
public int wordPositionInPhrase(String word) {
    // looks up the statistics entry stored for the word and returns its posInPhrase value
    // a word without an entry yields 0
    final wordStatProp stats = (wordStatProp) words.get(word);
    return (stats == null) ? 0 : stats.posInPhrase;
}
public int wordNumberOfPhrase(String word) {
    // looks up the statistics entry stored for the word and returns its numOfPhrase value
    // a word without an entry yields 0
    final wordStatProp stats = (wordStatProp) words.get(word);
    return (stats == null) ? 0 : stats.numOfPhrase;
}
public static class wordStatProp {
// object carries statistics for words and sentences
public int count; // number of occurrences
public int posInText; // unique handle, is initialized with word position (excluding double occurring words)
public int posInPhrase; //
public int numOfPhrase;
public HashSet hash; //
public wordStatProp(int handle, int pip, int nop) {
this.count = 1;
this.posInText = handle;
this.posInPhrase = pip;
this.numOfPhrase = nop;
this.hash = new HashSet();
}
public int handle;
public void inc() {
count++;
}
public HashSet hash;
public void check(int i) {
hash.add(Integer.toString(i));
}
public statProp(int handle) {
}
public static class phraseStatProp {
// object carries statistics for words and sentences
public int count; // number of occurrences
public int handle; // unique handle, is initialized with sentence counter
public HashSet hash; //
public phraseStatProp(int handle) {
this.count = 1;
this.handle = handle;
this.hash = new HashSet();
@ -145,6 +195,7 @@ public final class plasmaCondenser {
}
public String intString(int number, int length) {
String s = Integer.toString(number);
while (s.length() < length) s = "0" + s;
@ -160,13 +211,15 @@ public final class plasmaCondenser {
String word = "";
String k;
int wordlen;
statProp sp, sp1;
wordStatProp wsp, wsp1;
phraseStatProp psp;
int wordHandle;
int wordHandleCount = 0;
int sentenceHandleCount = 0;
int allwordcounter = 0;
int allsentencecounter = 0;
int idx;
int wordInSentenceCounter = 1;
Iterator it, it1;
// read source
@ -183,43 +236,45 @@ public final class plasmaCondenser {
sentence.insert(0, word); // append at beginning
if (sentences.containsKey(sentence)) {
// sentence already exists
sp = (statProp) sentences.get(sentence);
sp.inc();
idx = sp.handle;
sentences.put(sentence, sp);
psp = (phraseStatProp) sentences.get(sentence);
psp.inc();
idx = psp.handle;
sentences.put(sentence, psp);
} else {
// create new sentence
idx = sentenceHandleCount++;
sentences.put(sentence, new statProp(idx));
sentences.put(sentence, new phraseStatProp(idx));
}
// store to the words a link to this sentence
it = currsentwords.iterator();
while (it.hasNext()) {
k = (String) it.next();
sp = (statProp) words.get(k);
sp.check(idx);
words.put(k, sp);
wsp = (wordStatProp) words.get(k);
wsp.check(idx);
words.put(k, wsp);
}
}
sentence = new StringBuffer(100);
currsentwords.clear();
wordInSentenceCounter = 1;
} else {
// store word
allwordcounter++;
currsentwords.add(word);
if (words.containsKey(word)) {
// word already exists
sp = (statProp) words.get(word);
wordHandle = sp.handle;
sp.inc();
wsp = (wordStatProp) words.get(word);
wordHandle = wsp.posInText;
wsp.inc();
} else {
// word does not yet exist, create new word entry
wordHandle = wordHandleCount++;
sp = new statProp(wordHandle);
wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 1);
}
words.put(word, sp);
words.put(word, wsp);
// we now have the unique handle of the word, put it into the sentence:
sentence.append(intString(wordHandle, numlength));
wordInSentenceCounter++;
}
}
// finish last sentence
@ -227,11 +282,11 @@ public final class plasmaCondenser {
allsentencecounter++;
sentence.insert(0, "."); // append at beginning
if (sentences.containsKey(sentence)) {
sp = (statProp) sentences.get(sentence);
sp.inc();
sentences.put(sentence, sp);
psp = (phraseStatProp) sentences.get(sentence);
psp.inc();
sentences.put(sentence, psp);
} else {
sentences.put(sentence, new statProp(sentenceHandleCount++));
sentences.put(sentence, new phraseStatProp(sentenceHandleCount++));
}
}
@ -251,14 +306,14 @@ public final class plasmaCondenser {
sentence = (StringBuffer) o;
wc = (sentence.length() - 1) / numlength;
s = new String[wc + 2];
sp = (statProp) sentences.get(sentence);
s[0] = intString(sp.count, numlength); // number of occurrences of this sentence
psp = (phraseStatProp) sentences.get(sentence);
s[0] = intString(psp.count, numlength); // number of occurrences of this sentence
s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
for (int i = 0; i < wc; i++) {
k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
s[i + 2] = k;
}
orderedSentences[sp.handle] = s;
orderedSentences[psp.handle] = s;
}
}
@ -270,7 +325,7 @@ public final class plasmaCondenser {
entry = (Map.Entry) it.next();
word = (String) entry.getKey();
wordlen = word.length();
sp = (statProp) entry.getValue();
wsp = (wordStatProp) entry.getValue();
for (int i = wordcut; i > 0; i--) {
if (wordlen > i) {
k = word.substring(0, wordlen - i);
@ -278,20 +333,20 @@ public final class plasmaCondenser {
// we will delete the word 'word' and repoint the
// corresponding links
// in sentences that use this word
sp1 = (statProp) words.get(k);
it1 = sp.hash.iterator(); // we iterate over all sentences that refer to this word
wsp1 = (wordStatProp) words.get(k);
it1 = wsp.hash.iterator(); // we iterate over all sentences that refer to this word
while (it1.hasNext()) {
idx = Integer.parseInt((String) it1.next()); // number of a sentence
s = (String[]) orderedSentences[idx];
for (int j = 2; j < s.length; j++) {
if (s[j].equals(intString(sp.handle, numlength)))
s[j] = intString(sp1.handle, numlength);
if (s[j].equals(intString(wsp.posInText, numlength)))
s[j] = intString(wsp1.posInText, numlength);
}
orderedSentences[idx] = s;
}
// update word counter
sp1.count = sp1.count + sp.count;
words.put(k, sp1);
wsp1.count = wsp1.count + wsp.count;
words.put(k, wsp1);
// remove current word
it.remove();
continue wordsearch;
@ -311,16 +366,16 @@ public final class plasmaCondenser {
sentence.append(((String[]) orderedSentences[i])[j]);
if (sentences.containsKey(sentence)) {
// add sentence counter to counter of found sentence
sp = (statProp) sentences.get(sentence);
sp.count = sp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]);
sentences.put(sentence, sp);
psp = (phraseStatProp) sentences.get(sentence);
psp.count = psp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]);
sentences.put(sentence, psp);
// System.out.println("Found double occurring sentence " + i + "
// = " + sp.handle);
} else {
// create new sentence entry
sp = new statProp(i);
sp.count = Integer.parseInt(((String[]) orderedSentences[i])[0]);
sentences.put(sentence, sp);
psp = new phraseStatProp(i);
psp.count = Integer.parseInt(((String[]) orderedSentences[i])[0]);
sentences.put(sentence, psp);
}
}
@ -351,7 +406,7 @@ public final class plasmaCondenser {
// and order the entries by the number of the sentence
// this structure is only needed to reconstruct the text
String word;
statProp sp;
wordStatProp wsp;
Map.Entry entry;
Iterator it;
String[] orderedWords = new String[words.size() + 99]; // uuiiii, the '99' is only a quick hack...
@ -359,8 +414,8 @@ public final class plasmaCondenser {
while (it.hasNext()) {
entry = (Map.Entry) it.next();
word = (String) entry.getKey();
sp = (statProp) entry.getValue();
orderedWords[sp.handle] = word;
wsp = (wordStatProp) entry.getValue();
orderedWords[wsp.posInText] = word;
}
Object[] orderedSentences = makeOrderedSentences();
@ -388,7 +443,7 @@ public final class plasmaCondenser {
// this structure is needed to present the strings in the right order in a printout
int wc;
Iterator it;
statProp sp;
phraseStatProp psp;
String[] s;
StringBuffer sentence;
Object[] orderedSentences = new Object[sentences.size()];
@ -399,12 +454,12 @@ public final class plasmaCondenser {
sentence = (StringBuffer) it.next();
wc = (sentence.length() - 1) / numlength;
s = new String[wc + 2];
sp = (statProp) sentences.get(sentence);
s[0] = intString(sp.count, numlength); // number of occurrences of this sentence
psp = (phraseStatProp) sentences.get(sentence);
s[0] = intString(psp.count, numlength); // number of occurrences of this sentence
s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
for (int i = 0; i < wc; i++)
s[i + 2] = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
orderedSentences[sp.handle] = s;
orderedSentences[psp.handle] = s;
}
return orderedSentences;
}
@ -414,7 +469,7 @@ public final class plasmaCondenser {
String k;
String word;
Iterator it;
statProp sp;
wordStatProp wsp;
Object[] orderedSentences = makeOrderedSentences();
@ -426,8 +481,8 @@ public final class plasmaCondenser {
while (it.hasNext()) {
entry = (Map.Entry) it.next();
word = (String) entry.getKey();
sp = (statProp) entry.getValue();
sortedWords.put(intString(sp.count, numlength) + intString(sp.handle, numlength), word);
wsp = (wordStatProp) entry.getValue();
sortedWords.put(intString(wsp.count, numlength) + intString(wsp.posInText, numlength), word);
}
// start writing of words and sentences

@ -1316,14 +1316,17 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// iterate over all words
Iterator i = condenser.getWords().iterator();
int p = 0;
while (i.hasNext()) {
String word = (String) i.next();
int count = condenser.wordCount(word);
String wordHash = plasmaWordIndexEntry.word2hash(word);
plasmaWordIndexEntity wordIdxEntity = new plasmaWordIndexEntity(wordHash);
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash, count, p++, 0, 0,
plasmaWordIndex.microDateDays(docDate), quality, language, doctype, true);
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
condenser.wordCount(word),
condenser.wordPositionInText(word),
condenser.wordPositionInPhrase(word),
condenser.wordNumberOfPhrase(word),
docDate.getTime(),
quality, language, doctype, true);
wordIdxEntity.addEntry(wordIdxEntry);
tmpEntities.add(wordIdxEntity);
// wordIndex.addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry));

@ -109,55 +109,65 @@ public final class plasmaWordIndex {
private static final int day = 86400000;
public static int microDateDays(Date modified) {
// this calculates a virtual age from a given date
// the purpose is to have an age in days of a given modified date
// from a fixed standpoint in the past
// one day has 60*60*24 seconds = 86400 seconds
// we take mod 64**3 = 262144, this is the mask of the storage
return (int) ((modified.getTime() / day) % 262144);
return microDateDays(modified.getTime());
}
public static int microDateDays(long modified) {
    // derives a virtual age in days from a given time value:
    // the value is divided by the 'day' constant to get a whole number of days,
    // counted from a fixed standpoint in the past
    final long days = modified / day;
    // reduce mod 64**3 = 262144, this is the mask of the storage
    return (int) (days % 262144);
}
public static String microDateHoursStr(long time) {
return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3);
return kelondroBase64Order.enhancedCoder.encodeLong(microDateHoursInt(time), 3);
}
public static int microDateHoursInt(long time) {
return (int) ((time / hour) % 262144);
return (int) ((time / hour) % 262144);
}
public static int microDateHoursAge(String mdhs) {
    // difference between the hour count of the current system time
    // and the hour count decoded from the given encoded string
    final int currentHours = microDateHoursInt(System.currentTimeMillis());
    final int storedHours = (int) kelondroBase64Order.enhancedCoder.decodeLong(mdhs);
    return currentHours - storedHours;
}
public int addPageIndex(URL url, String urlHash, Date urlModified, plasmaCondenser condenser,
String language, char doctype) {
public static long reverseMicroDateDays(int microDateDays) {
    // scales a day count by the 'day' constant;
    // the multiplication is carried out on long values
    final long dayLength = day;
    return dayLength * microDateDays;
}
public int addPageIndex(URL url, String urlHash, Date urlModified, plasmaCondenser condenser, String language, char doctype) {
// this is called by the switchboard to put in a new page into the index
// use all the words in one condenser object to simultanous create index entries
int age = microDateDays(urlModified);
int quality = 0;
try {
quality = condenser.RESULT_INFORMATION_VALUE;
} catch (NumberFormatException e) {
System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString());
}
// use all the words in one condenser object to simultaneously create index
// entries
// int age = microDateDays(urlModified);
int quality = 0;
try {
quality = condenser.RESULT_INFORMATION_VALUE;
} catch (NumberFormatException e) {
System.out.println("INTERNAL ERROR WITH CONDENSER.INFORMATION_VALUE: " + e.toString() + ": in URL " + url.toString());
}
// iterate over all words
Iterator i = condenser.getWords().iterator();
String word;
int count;
plasmaWordIndexEntry entry;
String wordHash;
int p = 0;
while (i.hasNext()) {
word = (String) i.next();
count = condenser.wordCount(word);
//if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ": " + c);
wordHash = plasmaWordIndexEntry.word2hash(word);
entry = new plasmaWordIndexEntry(urlHash, count, p++, 0, 0,
age, quality, language, doctype, true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), false);
}
//System.out.println("DEBUG: plasmaSearch.addPageIndex: added " + condenser.getWords().size() + " words, flushed " + c + " entries");
Iterator i = condenser.getWords().iterator();
String word;
plasmaWordIndexEntry entry;
String wordHash;
while (i.hasNext()) {
word = (String) i.next();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaWordIndexEntry.word2hash(word);
entry = new plasmaWordIndexEntry(urlHash,
condenser.wordCount(word),
condenser.wordPositionInText(word),
condenser.wordPositionInPhrase(word),
condenser.wordNumberOfPhrase(word),
urlModified.getTime(), quality, language, doctype, true);
addEntries(plasmaWordIndexEntryContainer.instantContainer(wordHash, System.currentTimeMillis(), entry), false);
}
// System.out.println("DEBUG: plasmaSearch.addPageIndex: added " +
// condenser.getWords().size() + " words, flushed " + c + " entries");
return condenser.getWords().size();
}
@ -409,12 +419,15 @@ public final class plasmaWordIndex {
}
public static void main(String[] args) {
// System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y"));
// System.out.println(kelondroMSetTools.fastStringComparator(true).compare("RwGeoUdyDQ0Y", "rwGeoUdyDQ0Y"));
// System.out.println(new Date(reverseMicroDateDays(microDateDays(System.currentTimeMillis()))));
plasmaWordIndex index = new plasmaWordIndex(new File("D:\\dev\\proxy\\DATA\\PLASMADB"), 555, new serverLog("TESTAPP"));
Iterator iter = index.wordHashes("5A8yhZMh_Kmv", true, true);
while (iter.hasNext()) {
System.out.println("File: " + (String) iter.next());
}
}
}

@ -138,7 +138,7 @@ public final class plasmaWordIndexAssortment {
for (int i = 0; i < assortmentLength; i++) {
entry = (plasmaWordIndexEntry) entries.next();
row[3 + 2 * i] = entry.getUrlHash().getBytes();
row[4 + 2 * i] = entry.toEncodedForm(true).getBytes();
row[4 + 2 * i] = entry.toEncodedForm(1).getBytes();
}
byte[][] oldrow = null;
try {

@ -153,7 +153,7 @@ public final class plasmaWordIndexCache implements plasmaWordIndexInterface {
row[1] = kelondroRecords.long2bytes(container.size(), 4);
row[2] = kelondroRecords.long2bytes(updateTime, 8);
row[3] = wordEntry.getUrlHash().getBytes();
row[4] = wordEntry.toEncodedForm(true).getBytes();
row[4] = wordEntry.toEncodedForm(1).getBytes();
dumpArray.set((int) urlcount++, row);
}
}

@ -168,7 +168,7 @@ public final class plasmaWordIndexEntity {
public boolean addEntry(plasmaWordIndexEntry entry) throws IOException {
if (entry == null) return false;
if (theTmpMap == null) {
return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm(false).getBytes()) == null);
return (theIndex.put(entry.getUrlHash().getBytes(), entry.toEncodedForm(0).getBytes()) == null);
} else {
return (theTmpMap.put(entry.getUrlHash(), entry) == null);
}

@ -78,7 +78,7 @@ public final class plasmaWordIndexEntry {
private int posintext; // first position of the word in text as number of word; 0=unknown or irrelevant position
private int posinphrase; // position within a phrase of the word
private int posofphrase; // position of the phrase in the text as count of sentences; 0=unknown; 1=path; 2=keywords; 3=headline; >4: in text
private int age; // calculated by using last-modified
private long lastModified;// calculated by using last-modified
private int quality; // result of a heuristic on the source file
private byte[] language; // essentially the country code (the TLD as heuristic), two letters lowercase only
private char doctype; // type of source
@ -186,17 +186,36 @@ public final class plasmaWordIndexEntry {
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public plasmaWordIndexEntry(String urlHash, int count, int posintext, int posinphrase, int posofphrase, int virtualage, int quality, String language, char doctype, boolean local) {
// ** hier fehlt noch als Attribut: <Wortposition im Text>, damit 'nearby' getrackt werden kann **
public plasmaWordIndexEntry(String urlHash,
int count, // how often appears this word in the text
int posintext,
int posinphrase,
int posofphrase,
long time,
int quality,
String language,
char doctype,
boolean local) {
// more needed attributes:
// - int: length of text / total number of words
// - int: length of text / total number of sentences
// - long: update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
// - int: word distance; this is 0 by default, and set to the difference of posintext from two indexes if these are combined (simultaneous search). If stored, this shows that the result was obtained by remote search
// - char: category of appearance (header, title, section, text, anchor-descr, image-tag etc)
// - boolean: appears in title, appears in header, appears in ....
// - int: url-length (shorter are better)
// - int: url-number of components / length of path
// - int: length of description tag / title tag (longer are better)
// - int: number of chapters
if ((language == null) || (language.length() != plasmaURL.urlLanguageLength)) language = "uk";
this.urlHash = urlHash;
this.count = count;
this.posintext = posintext;
this.posinphrase = posinphrase;
this.posofphrase = posofphrase;
this.age = virtualage;
this.lastModified = time;
this.quality = quality;
this.language = language.getBytes();
this.doctype = doctype;
@ -210,7 +229,7 @@ public final class plasmaWordIndexEntry {
this.posintext = (code.length() >= 14) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(12, 14)) : 0;
this.posinphrase = (code.length() >= 15) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(14, 16)) : 0;
this.posofphrase = (code.length() >= 16) ? (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(16, 18)) : 0;
this.age = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6));
this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(3, 6)));
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(code.substring(0, 3));
this.language = code.substring(8, 10).getBytes();
this.doctype = code.charAt(10);
@ -231,57 +250,31 @@ public final class plasmaWordIndexEntry {
this.posintext = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("t", "__"));
this.posinphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("r", "__"));
this.posofphrase = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("o", "__"));
this.age = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A"));
this.lastModified = plasmaWordIndex.reverseMicroDateDays((int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("a", "A")));
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__"));
this.language = pr.getProperty("l", "uk").getBytes();
this.doctype = pr.getProperty("d", "u").charAt(0);
this.localflag = pr.getProperty("f", ""+LT_LOCAL).charAt(0);
}
private String b64save(long x, int l) {
try {
return kelondroBase64Order.enhancedCoder.encodeLong(x, l);
} catch (Exception e) {
// if x does not fit into l
return "________".substring(0, l);
}
}
public String toEncodedForm(boolean longAttr) {
public String toEncodedForm(int outputFormat) {
// attention: this integrates NOT the URL into the encoding
// if you need a complete dump, use toExternalForm()
StringBuffer buf = new StringBuffer(longAttr?18:12);
StringBuffer buf = new StringBuffer((outputFormat >= 1) ? 18 : 12);
buf.append(b64save(this.quality, plasmaURL.urlQualityLength))
.append(b64save(this.age, 3))
.append(b64save(this.count, 2))
buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, plasmaURL.urlQualityLength))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.count, 2))
.append(new String(this.language))
.append(this.doctype)
.append(this.localflag); // 3 + 3 + 2 + 2 + 1 + 1 = 12 bytes
if (longAttr)
buf.append(b64save(this.posintext, 2))
.append(b64save(this.posinphrase, 2))
.append(b64save(this.posofphrase, 2));
if (outputFormat >= 1)
buf.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
.append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2));
return buf.toString();
// String shortAttr =
// b64save(quality, plasmaCrawlLURL.urlQualityLength) +
// b64save(age, 3) +
// b64save(count, 2) +
// new String(language) +
// doctype +
// localflag; // 3 + 3 + 2 + 2 + 1 + 1 = 12 bytes
// if (longAttr)
// return
// shortAttr +
// b64save(posintext, 2) +
// b64save(posinphrase, 2) +
// b64save(posofphrase, 2);
// // 12 + 3 + 2 + 2 + 1 + 1 = 12 bytes
// else
// return shortAttr;
}
public String toExternalForm() {
@ -289,15 +282,15 @@ public final class plasmaWordIndexEntry {
str.append("{")
.append("h=").append(this.urlHash)
.append(",q=").append(b64save(this.quality, plasmaURL.urlQualityLength))
.append(",a=").append(b64save(this.age, 3))
.append(",c=").append(b64save(this.count, 2))
.append(",q=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.quality, plasmaURL.urlQualityLength))
.append(",a=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(plasmaWordIndex.microDateDays(this.lastModified), 3))
.append(",c=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.count, 2))
.append(",l=").append(new String(this.language))
.append(",d=").append(this.doctype)
.append(",f=").append(this.localflag)
.append(",t=").append(b64save(this.posintext, 2))
.append(",r=").append(b64save(this.posinphrase, 2))
.append(",o=").append(b64save(this.posofphrase, 2))
.append(",t=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posintext, 2))
.append(",r=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posinphrase, 2))
.append(",o=").append(kelondroBase64Order.enhancedCoder.encodeLongSmart(this.posofphrase, 2))
.append("}");
return str.toString();
@ -312,7 +305,11 @@ public final class plasmaWordIndexEntry {
}
public int getVirtualAge() {
return age;
return plasmaWordIndex.microDateDays(lastModified);
}
public long getLastModified() {
    // hands out the lastModified value stored in this entry, unchanged
    return this.lastModified;
}
public int getCount() {

Loading…
Cancel
Save