start with refactoring of index management

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2110 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 63a08307c2
commit a474669338

@ -57,6 +57,7 @@ import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
@ -167,7 +168,7 @@ public class IndexControl_p {
switchboard.wordIndex.deleteIndex(keyhash);
post.remove("keyhashdeleteall");
if (keystring.length() > 0 &&
plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) {
indexEntryAttribute.word2hash(keystring).equals(keyhash)) {
post.put("keystringsearch", "generated");
} else {
post.put("keyhashsearch", "generated");
@ -189,7 +190,7 @@ public class IndexControl_p {
// thinks that it was called for a list presentation
post.remove("keyhashdelete");
if (keystring.length() > 0 &&
plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) {
indexEntryAttribute.word2hash(keystring).equals(keyhash)) {
post.put("keystringsearch", "generated");
} else {
post.put("keyhashsearch", "generated");
@ -220,7 +221,7 @@ public class IndexControl_p {
}
if (post.containsKey("keystringsearch")) {
keyhash = plasmaWordIndexEntry.word2hash(keystring);
keyhash = indexEntryAttribute.word2hash(keystring);
prop.put("keyhash", keyhash);
prop.put("urlstring", "");
prop.put("urlhash", "");
@ -229,7 +230,7 @@ public class IndexControl_p {
if (post.containsKey("keyhashsearch")) {
if (keystring.length() == 0 ||
!plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) {
!indexEntryAttribute.word2hash(keystring).equals(keyhash)) {
prop.put("keystring", "<not possible to compute word from hash>");
}
prop.put("urlstring", "");
@ -240,7 +241,7 @@ public class IndexControl_p {
// transfer to other peer
if (post.containsKey("keyhashtransfer")) {
if (keystring.length() == 0 ||
!plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) {
!indexEntryAttribute.word2hash(keystring).equals(keyhash)) {
prop.put("keystring", "<not possible to compute word from hash>");
}
prop.put("urlstring", "");

@ -58,12 +58,12 @@ import java.util.Map;
import java.net.InetAddress;
import java.net.UnknownHostException;
import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.server.serverCodings;
import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils;
@ -463,7 +463,7 @@ public class dir {
false, /*localneed*/
condenser.RESULT_WORD_ENTROPHY,
"**", /*language*/
plasmaWordIndexEntry.DT_SHARE, /*doctype*/
indexEntryAttribute.DT_SHARE, /*doctype*/
phrase.length(), /*size*/
condenser.RESULT_NUMB_WORDS
);
@ -476,7 +476,7 @@ public class dir {
);
final String urlHash = newEntry.hash();
/*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, null, condenser, "**", plasmaWordIndexEntry.DT_SHARE, 0, 0);
/*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, null, condenser, "**", indexEntryAttribute.DT_SHARE, 0, 0);
} catch (IOException e) {}
}
@ -487,7 +487,7 @@ public class dir {
Map.Entry entry;
while (words.hasNext()) {
entry = (Map.Entry) words.next();
switchboard.wordIndex.removeEntries(plasmaWordIndexEntry.word2hash((String) entry.getKey()), new String[] {urlhash}, true);
switchboard.wordIndex.removeEntries(indexEntryAttribute.word2hash((String) entry.getKey()), new String[] {urlhash}, true);
}
switchboard.urlPool.loadedURL.remove(urlhash);
} catch (Exception e) {

@ -49,6 +49,7 @@
import java.util.HashSet;
import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchRankingProfile;
@ -56,7 +57,6 @@ import de.anomic.plasma.plasmaSearchResult;
import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.server.serverCore;
import de.anomic.server.serverObjects;
@ -100,9 +100,9 @@ public final class search {
}
// prepare search
final HashSet keyhashes = new HashSet(query.length() / plasmaWordIndexEntry.wordHashLength);
for (int i = 0; i < (query.length() / plasmaWordIndexEntry.wordHashLength); i++) {
keyhashes.add(query.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength));
final HashSet keyhashes = new HashSet(query.length() / indexEntryAttribute.wordHashLength);
for (int i = 0; i < (query.length() / indexEntryAttribute.wordHashLength); i++) {
keyhashes.add(query.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength));
}
final long timestamp = System.currentTimeMillis();

@ -69,11 +69,11 @@ import org.w3c.dom.NodeList;
import org.xml.sax.SAXException;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMap;
import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
@ -85,7 +85,7 @@ public class bookmarksDB {
HashMap bookmarkCache;
public static String tagHash(String tagName){
return plasmaWordIndexEntry.word2hash(tagName.toLowerCase());
return indexEntryAttribute.word2hash(tagName.toLowerCase());
}
public static String dateToiso8601(Date date){
return new SimpleDateFormat("yyyy-MM-dd").format(date)+"T"+(new SimpleDateFormat("HH:mm:ss")).format(date)+"Z";

@ -0,0 +1,172 @@
// indexEntryAttribute.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 16.05.2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.index;
import java.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
/**
 * Collection of constants and static helpers that describe attributes of
 * index entries: hash lengths, document type codes, appearance-location flag
 * positions, URL attribute flag positions and local/global markers, plus
 * helpers that derive a word hash, a document type or a language code.
 *
 * All members are static; the class carries no state.
 */
public class indexEntryAttribute {

    // the size of a word hash
    public static final int wordHashLength = yacySeedDB.commonHashLength; // 12
    public static final int urlHashLength  = yacySeedDB.commonHashLength; // 12

    // doctypes:
    public static final char DT_PDFPS   = 'p';
    public static final char DT_TEXT    = 't';
    public static final char DT_HTML    = 'h';
    public static final char DT_DOC     = 'd';
    public static final char DT_IMAGE   = 'i';
    public static final char DT_MOVIE   = 'm';
    public static final char DT_FLASH   = 'f';
    public static final char DT_SHARE   = 's';
    public static final char DT_AUDIO   = 'a';
    public static final char DT_BINARY  = 'b';
    public static final char DT_UNKNOWN = 'u';

    // appearance locations: (used for flags)
    public static final int AP_TITLE     =  0; // title tag from html header
    public static final int AP_H1        =  1; // headline - top level
    public static final int AP_H2        =  2; // headline, second level
    public static final int AP_H3        =  3; // headline, 3rd level
    public static final int AP_H4        =  4; // headline, 4th level
    public static final int AP_H5        =  5; // headline, 5th level
    public static final int AP_H6        =  6; // headline, 6th level
    public static final int AP_TEXT      =  7; // word appears in text (used to check validation of other appearances against spam)
    public static final int AP_DOM       =  8; // word inside an url: in domain
    public static final int AP_PATH      =  9; // word inside an url: in path
    public static final int AP_IMG       = 10; // tag inside image references
    public static final int AP_ANCHOR    = 11; // anchor description
    public static final int AP_ENV       = 12; // word appears in environment (similar to anchor appearance)
    public static final int AP_BOLD      = 13; // may be interpreted as emphasized
    public static final int AP_ITALICS   = 14; // may be interpreted as emphasized
    public static final int AP_WEAK      = 15; // for text that is small or barely visible
    public static final int AP_INVISIBLE = 16; // good for spam detection
    public static final int AP_TAG       = 17; // for tagged indexing (i.e. using mp3 tags)
    public static final int AP_AUTHOR    = 18; // word appears in author name
    public static final int AP_OPUS      = 19; // word appears in name of opus, which may be an album name (in mp3 tags)
    public static final int AP_TRACK     = 20; // word appears in track name (i.e. in mp3 tags)

    // URL attributes
    public static final int UA_LOCAL    = 0; // URL was crawled locally
    public static final int UA_TILDE    = 1; // tilde appears in URL
    public static final int UA_REDIRECT = 2; // the URL is a redirection

    // local flag attributes
    public static final char LT_LOCAL  = 'L';
    public static final char LT_GLOBAL = 'G';

    /**
     * Creates the hash of a word.
     *
     * The word is lower-cased, passed through serverCodings.encodeMD5Raw,
     * encoded with kelondroBase64Order.enhancedCoder and cut to the first
     * wordHashLength characters.
     *
     * @param word the word to hash; must not be null
     * @return the word hash, wordHashLength characters long
     */
    public static String word2hash(String word) {
        // Lower-casing must not depend on the platform default locale: with e.g. a
        // Turkish locale "I" maps to a dotless i, so the same word would produce a
        // different hash on different machines.
        final String normalized = word.toLowerCase(java.util.Locale.ENGLISH);
        return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(normalized)).substring(0, wordHashLength);
    }

    /**
     * Derives the document type from the file extension at the end of the
     * normal form of a URL (as produced by htmlFilterContentScraper.urlNormalform).
     *
     * @param url the URL to classify
     * @return one of the DT_* codes; DT_UNKNOWN if no known extension matches
     */
    public static char docType(URL url) {
        final String path = htmlFilterContentScraper.urlNormalform(url);
        // serverLog.logFinest("PLASMA", "docType URL=" + path);
        // All suffixes below are mutually exclusive, so the order of the checks is irrelevant.
        if (path.endsWith(".gif") || path.endsWith(".jpg") || path.endsWith(".jpeg") || path.endsWith(".png")) {
            return DT_IMAGE;
        }
        if (path.endsWith(".html")) {
            return DT_HTML;
        }
        if (path.endsWith(".txt")) {
            return DT_TEXT;
        }
        if (path.endsWith(".doc") || path.endsWith(".rtf")) {
            return DT_DOC;
        }
        if (path.endsWith(".pdf") || path.endsWith(".ps")) {
            return DT_PDFPS;
        }
        if (path.endsWith(".avi") || path.endsWith(".mov") || path.endsWith(".qt") || path.endsWith(".mpg") || path.endsWith(".mpeg")) {
            return DT_MOVIE;
        }
        if (path.endsWith(".md5")) {
            return DT_SHARE;
        }
        // NOTE(review): .asf is mapped to DT_FLASH here; this mapping is kept unchanged,
        // but it looks like DT_MOVIE may have been intended - confirm before changing.
        if (path.endsWith(".asf")) {
            return DT_FLASH;
        }
        return DT_UNKNOWN;
    }

    /**
     * Derives the document type from a MIME type string.
     *
     * Parameters following a ';' (such as "; charset=...") are stripped and the
     * remaining type/subtype is trimmed and lower-cased before matching, so that
     * e.g. "text/html; charset=utf-8" and "TEXT/HTML" are both recognized as HTML.
     *
     * @param mime the MIME type, may be null
     * @return one of the DT_* codes; DT_UNKNOWN if mime is null or not recognized
     */
    public static char docType(String mime) {
        // serverLog.logFinest("PLASMA", "docType mime=" + mime);
        if (mime == null) {
            return DT_UNKNOWN;
        }
        final int paramStart = mime.indexOf(';');
        final String type = ((paramStart < 0) ? mime : mime.substring(0, paramStart)).trim().toLowerCase(java.util.Locale.ENGLISH);

        // The subtype checks come before the generic "text/", "audio/" and "video/"
        // prefix checks so that e.g. text/html is classified as HTML and not as plain text.
        if (type.startsWith("image/")) return DT_IMAGE;
        if (type.endsWith("/gif")) return DT_IMAGE;
        if (type.endsWith("/jpeg")) return DT_IMAGE;
        if (type.endsWith("/png")) return DT_IMAGE;
        if (type.endsWith("/html")) return DT_HTML;
        if (type.endsWith("/rtf")) return DT_DOC;
        if (type.endsWith("/pdf")) return DT_PDFPS;
        if (type.endsWith("/octet-stream")) return DT_BINARY;
        if (type.endsWith("/x-shockwave-flash")) return DT_FLASH;
        if (type.endsWith("/msword")) return DT_DOC;
        if (type.endsWith("/mspowerpoint")) return DT_DOC;
        if (type.endsWith("/postscript")) return DT_PDFPS;
        if (type.startsWith("text/")) return DT_TEXT;
        if (type.startsWith("audio/")) return DT_AUDIO;
        if (type.startsWith("video/")) return DT_MOVIE;

        // MIME types that are known but not mapped to a doctype (they yield DT_UNKNOWN):
        //bz2     = application/x-bzip2
        //dvi     = application/x-dvi
        //gz      = application/gzip
        //hqx     = application/mac-binhex40
        //lha     = application/x-lzh
        //lzh     = application/x-lzh
        //pac     = application/x-ns-proxy-autoconfig
        //php     = application/x-httpd-php
        //phtml   = application/x-httpd-php
        //rss     = application/xml
        //tar     = application/tar
        //tex     = application/x-tex
        //tgz     = application/tar
        //torrent = application/x-bittorrent
        //xhtml   = application/xhtml+xml
        //xla     = application/msexcel
        //xls     = application/msexcel
        //xsl     = application/xml
        //xml     = application/xml
        //Z       = application/x-compress
        //zip     = application/zip
        return DT_UNKNOWN;
    }

    /**
     * Derives a two-letter language code from the host name of a URL.
     *
     * If the part of the host name after its last dot is exactly two characters
     * long, that part is returned in lower case; otherwise "uk" is returned.
     *
     * @param url the URL whose host name is examined
     * @return the lower-cased two-character host suffix, or "uk" as default
     */
    public static String language(URL url) {
        final String host = url.getHost();
        final int pos = host.lastIndexOf(".");
        // exactly two characters must follow the last dot
        if ((pos > 0) && (host.length() - pos == 3)) {
            // locale-independent lower-casing, for the same reason as in word2hash
            return host.substring(pos + 1).toLowerCase(java.util.Locale.ENGLISH);
        }
        return "uk";
    }

}

@ -56,6 +56,7 @@ package de.anomic.plasma;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroMScoreCluster;
@ -761,9 +762,9 @@ public final class plasmaHTCache {
this.lastModified = responseHeader.lastModified();
if (this.lastModified == null) this.lastModified = new Date(serverDate.correctedUTCTime()); // does not exist in header
}
this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime());
if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url);
this.language = plasmaWordIndexEntry.language(url);
this.doctype = indexEntryAttribute.docType(responseHeader.mime());
if (this.doctype == indexEntryAttribute.DT_UNKNOWN) this.doctype = indexEntryAttribute.docType(url);
this.language = indexEntryAttribute.language(url);
// to be defined later:
this.cacheArray = null;

@ -47,6 +47,7 @@ import java.util.TreeSet;
import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.serverByteBuffer;
@ -100,14 +101,14 @@ public final class plasmaSearchQuery {
public static Set words2hashes(String[] words) {
TreeSet hashes = new TreeSet();
for (int i = 0; i < words.length; i++) hashes.add(plasmaWordIndexEntry.word2hash(words[i]));
for (int i = 0; i < words.length; i++) hashes.add(indexEntryAttribute.word2hash(words[i]));
return hashes;
}
public static Set words2hashes(Set words) {
Iterator i = words.iterator();
TreeSet hashes = new TreeSet();
while (i.hasNext()) hashes.add(plasmaWordIndexEntry.word2hash((String) i.next()));
while (i.hasNext()) hashes.add(indexEntryAttribute.word2hash((String) i.next()));
return hashes;
}

@ -55,6 +55,7 @@ import java.net.MalformedURLException;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute;
public final class plasmaSearchResult {
@ -249,7 +250,7 @@ public final class plasmaSearchResult {
word = words[i].toLowerCase();
if ((word.length() > 2) &&
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
(!(query.queryHashes.contains(plasmaWordIndexEntry.word2hash(word)))))
(!(query.queryHashes.contains(indexEntryAttribute.word2hash(word)))))
ref.incScore(word);
}
}

@ -55,6 +55,7 @@ import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySearch;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute;
public class plasmaSnippetCache {
@ -125,7 +126,7 @@ public class plasmaSnippetCache {
while (i.hasNext()) {
h = (String) i.next();
for (int j = 0; j < w.length; j++) {
if (plasmaWordIndexEntry.word2hash(w[j]).equals(h)) w[j] = "<b>" + w[j] + "</b>";
if (indexEntryAttribute.word2hash(w[j]).equals(h)) w[j] = "<b>" + w[j] + "</b>";
}
}
StringBuffer l = new StringBuffer(line.length() + queryHashes.size() * 8);
@ -347,7 +348,7 @@ public class plasmaSnippetCache {
String word;
while (words.hasMoreElements()) {
word = (String) words.nextElement();
map.put(plasmaWordIndexEntry.word2hash(word), new Integer(pos));
map.put(indexEntryAttribute.word2hash(word), new Integer(pos));
pos += word.length() + 1;
}
return map;

@ -130,6 +130,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools;
@ -1422,8 +1423,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
referrerHash,
0, true,
condenser.RESULT_WORD_ENTROPHY,
plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()),
indexEntryAttribute.language(entry.url()),
indexEntryAttribute.docType(document.getMimeType()),
(int) entry.size(),
condenser.RESULT_NUMB_WORDS
);
@ -1451,14 +1452,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(storagePeerHash.trim().length() == 0) ||
((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), document, condenser,
plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()),
indexEntryAttribute.language(entry.url()), indexEntryAttribute.docType(document.getMimeType()),
ioLinks[0].intValue(), ioLinks[1].intValue());
} else {
HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry);
ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS);
String language = plasmaWordIndexEntry.language(entry.url());
char doctype = plasmaWordIndexEntry.docType(document.getMimeType());
String language = indexEntryAttribute.language(entry.url());
char doctype = indexEntryAttribute.docType(document.getMimeType());
int urlLength = newEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length;
@ -1470,7 +1471,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wentry = (Map.Entry) i.next();
String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = plasmaWordIndexEntry.word2hash(word);
String wordHash = indexEntryAttribute.word2hash(word);
plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash);
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps,
@ -1509,8 +1510,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (error != null) {
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(),
document, condenser,
plasmaWordIndexEntry.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()),
indexEntryAttribute.language(entry.url()),
indexEntryAttribute.docType(document.getMimeType()),
ioLinks[0].intValue(), ioLinks[1].intValue());
}
@ -1990,7 +1991,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
while (iter.hasNext()) {
word = (String) iter.next();
// delete the URL reference in this word index
count += wordIndex.removeEntries(plasmaWordIndexEntry.word2hash(word), urlEntries, true);
count += wordIndex.removeEntries(indexEntryAttribute.word2hash(word), urlEntries, true);
}
return count;
}
@ -2006,7 +2007,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
entry = (Map.Entry) wordStatPropIterator.next();
word = (String) entry.getKey();
// delete the URL reference in this word index
count += wordIndex.removeEntries(plasmaWordIndexEntry.word2hash(word), urlEntries, true);
count += wordIndex.removeEntries(indexEntryAttribute.word2hash(word), urlEntries, true);
}
return count;
}

@ -58,6 +58,7 @@ import java.util.TreeSet;
import java.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMergeIterator;
@ -247,7 +248,7 @@ public final class plasmaWordIndex {
word = (String) wentry.getKey();
wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaWordIndexEntry.word2hash(word);
wordHash = indexEntryAttribute.word2hash(word);
ientry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(),
wprop.count,

@ -56,6 +56,7 @@ import java.io.File;
import java.io.IOException;
import java.util.Iterator;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroTree;
@ -66,10 +67,10 @@ public final class plasmaWordIndexAssortment {
// environment constants
private static final String assortmentFileName = "indexAssortment";
public static final int[] bufferStructureBasis = new int[]{
plasmaWordIndexEntry.wordHashLength, // a wordHash
indexEntryAttribute.wordHashLength, // a wordHash
4, // occurrence counter
8, // timestamp of last access
plasmaWordIndexEntry.urlHashLength, // corresponding URL hash
indexEntryAttribute.urlHashLength, // corresponding URL hash
plasmaWordIndexEntry.attrSpace // URL attributes
};

@ -48,13 +48,9 @@
package de.anomic.plasma;
import java.net.URL;
import java.util.Properties;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
// import de.anomic.server.logging.serverLog;
public final class plasmaWordIndexEntry implements Cloneable {
@ -62,10 +58,6 @@ public final class plasmaWordIndexEntry implements Cloneable {
// by the discrete values of the entry
// or by the encoded entry-string
// the size of a word hash
public static final int wordHashLength = yacySeedDB.commonHashLength; // 12
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
// the size of the index entry attributes
public static final int attrSpace = 24;
@ -86,134 +78,6 @@ public final class plasmaWordIndexEntry implements Cloneable {
private char doctype; // type of source
private char localflag; // indicates if the index was created locally
// doctypes:
public static final char DT_PDFPS = 'p';
public static final char DT_TEXT = 't';
public static final char DT_HTML = 'h';
public static final char DT_DOC = 'd';
public static final char DT_IMAGE = 'i';
public static final char DT_MOVIE = 'm';
public static final char DT_FLASH = 'f';
public static final char DT_SHARE = 's';
public static final char DT_AUDIO = 'a';
public static final char DT_BINARY = 'b';
public static final char DT_UNKNOWN = 'u';
// appearance locations: (used for flags)
public static final int AP_TITLE = 0; // title tag from html header
public static final int AP_H1 = 1; // headline - top level
public static final int AP_H2 = 2; // headline, second level
public static final int AP_H3 = 3; // headline, 3rd level
public static final int AP_H4 = 4; // headline, 4th level
public static final int AP_H5 = 5; // headline, 5th level
public static final int AP_H6 = 6; // headline, 6th level
public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam)
public static final int AP_DOM = 8; // word inside an url: in Domain
public static final int AP_PATH = 9; // word inside an url: in path
public static final int AP_IMG = 10; // tag inside image references
public static final int AP_ANCHOR = 11; // anchor description
public static final int AP_ENV = 12; // word appears in environment (similar to anchor appearance)
public static final int AP_BOLD = 13; // may be interpreted as emphasized
public static final int AP_ITALICS = 14; // may be interpreted as emphasized
public static final int AP_WEAK = 15; // for Text that is small or bareley visible
public static final int AP_INVISIBLE = 16; // good for spam detection
public static final int AP_TAG = 17; // for tagged indexeing (i.e. using mp3 tags)
public static final int AP_AUTHOR = 18; // word appears in author name
public static final int AP_OPUS = 19; // word appears in name of opus, which may be an album name (in mp3 tags)
public static final int AP_TRACK = 20; // word appears in track name (i.e. in mp3 tags)
// URL attributes
public static final int UA_LOCAL = 0; // URL was crawled locally
public static final int UA_TILDE = 1; // tilde appears in URL
public static final int UA_REDIRECT = 2; // The URL is a redirection
// local flag attributes
public static final char LT_LOCAL = 'L';
public static final char LT_GLOBAL = 'G';
// create a word hash
public static String word2hash(String word) {
return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, wordHashLength);
}
// doctype calculation
public static char docType(URL url) {
String path = htmlFilterContentScraper.urlNormalform(url);
// serverLog.logFinest("PLASMA", "docType URL=" + path);
char doctype = doctype = DT_UNKNOWN;
if (path.endsWith(".gif")) { doctype = DT_IMAGE; }
else if (path.endsWith(".jpg")) { doctype = DT_IMAGE; }
else if (path.endsWith(".jpeg")) { doctype = DT_IMAGE; }
else if (path.endsWith(".png")) { doctype = DT_IMAGE; }
else if (path.endsWith(".html")) { doctype = DT_HTML; }
else if (path.endsWith(".txt")) { doctype = DT_TEXT; }
else if (path.endsWith(".doc")) { doctype = DT_DOC; }
else if (path.endsWith(".rtf")) { doctype = DT_DOC; }
else if (path.endsWith(".pdf")) { doctype = DT_PDFPS; }
else if (path.endsWith(".ps")) { doctype = DT_PDFPS; }
else if (path.endsWith(".avi")) { doctype = DT_MOVIE; }
else if (path.endsWith(".mov")) { doctype = DT_MOVIE; }
else if (path.endsWith(".qt")) { doctype = DT_MOVIE; }
else if (path.endsWith(".mpg")) { doctype = DT_MOVIE; }
else if (path.endsWith(".md5")) { doctype = DT_SHARE; }
else if (path.endsWith(".mpeg")) { doctype = DT_MOVIE; }
else if (path.endsWith(".asf")) { doctype = DT_FLASH; }
return doctype;
}
public static char docType(String mime) {
// serverLog.logFinest("PLASMA", "docType mime=" + mime);
char doctype = DT_UNKNOWN;
if (mime == null) doctype = DT_UNKNOWN;
else if (mime.startsWith("image/")) doctype = DT_IMAGE;
else if (mime.endsWith("/gif")) doctype = DT_IMAGE;
else if (mime.endsWith("/jpeg")) doctype = DT_IMAGE;
else if (mime.endsWith("/png")) doctype = DT_IMAGE;
else if (mime.endsWith("/html")) doctype = DT_HTML;
else if (mime.endsWith("/rtf")) doctype = DT_DOC;
else if (mime.endsWith("/pdf")) doctype = DT_PDFPS;
else if (mime.endsWith("/octet-stream")) doctype = DT_BINARY;
else if (mime.endsWith("/x-shockwave-flash")) doctype = DT_FLASH;
else if (mime.endsWith("/msword")) doctype = DT_DOC;
else if (mime.endsWith("/mspowerpoint")) doctype = DT_DOC;
else if (mime.endsWith("/postscript")) doctype = DT_PDFPS;
else if (mime.startsWith("text/")) doctype = DT_TEXT;
else if (mime.startsWith("image/")) doctype = DT_IMAGE;
else if (mime.startsWith("audio/")) doctype = DT_AUDIO;
else if (mime.startsWith("video/")) doctype = DT_MOVIE;
//bz2 = application/x-bzip2
//dvi = application/x-dvi
//gz = application/gzip
//hqx = application/mac-binhex40
//lha = application/x-lzh
//lzh = application/x-lzh
//pac = application/x-ns-proxy-autoconfig
//php = application/x-httpd-php
//phtml = application/x-httpd-php
//rss = application/xml
//tar = application/tar
//tex = application/x-tex
//tgz = application/tar
//torrent = application/x-bittorrent
//xhtml = application/xhtml+xml
//xla = application/msexcel
//xls = application/msexcel
//xsl = application/xml
//xml = application/xml
//Z = application/x-compress
//zip = application/zip
return doctype;
}
// language calculation
public static String language(URL url) {
String language = "uk";
String host = url.getHost();
int pos = host.lastIndexOf(".");
if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase();
return language;
}
// the class instantiation can only be done by a plasmaStore method
// therefore they are all public
public plasmaWordIndexEntry(String urlHash,
@ -255,7 +119,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
this.quality = quality;
this.language = language.getBytes();
this.doctype = doctype;
this.localflag = (local) ? LT_LOCAL : LT_GLOBAL;
this.localflag = (local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL;
}
public plasmaWordIndexEntry(String urlHash, String code) {
@ -299,7 +163,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__"));
this.language = pr.getProperty("l", "uk").getBytes();
this.doctype = pr.getProperty("d", "u").charAt(0);
this.localflag = pr.getProperty("f", ""+LT_LOCAL).charAt(0);
this.localflag = pr.getProperty("f", ""+indexEntryAttribute.LT_LOCAL).charAt(0);
}
public Object clone() {
@ -412,7 +276,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
public int phrasecount() { return phrasecount; }
public String getLanguage() { return new String(language); }
public char getType() { return doctype; }
public boolean isLocal() { return localflag == LT_LOCAL; }
public boolean isLocal() { return localflag == indexEntryAttribute.LT_LOCAL; }
public boolean isNewer(plasmaWordIndexEntry other) {
if (other == null) return true;
@ -439,7 +303,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
public static void main(String[] args) {
// outputs the word hash to a given word
if (args.length != 1) System.exit(0);
System.out.println("WORDHASH: " + word2hash(args[0]));
System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0]));
}
}

@ -53,6 +53,7 @@ import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchRankingProfile;
@ -464,10 +465,10 @@ public final class yacyClient {
//System.out.println("***result count " + results);
// create containers
final int words = wordhashes.length() / plasmaWordIndexEntry.wordHashLength;
final int words = wordhashes.length() / indexEntryAttribute.wordHashLength;
plasmaWordIndexEntryContainer[] container = new plasmaWordIndexEntryContainer[words];
for (int i = 0; i < words; i++) {
container[i] = new plasmaWordIndexEntryContainer(wordhashes.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength));
container[i] = new plasmaWordIndexEntryContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength));
}
// insert results to containers
@ -1093,7 +1094,7 @@ public final class yacyClient {
/*final yacyCore core =*/ new yacyCore(sb);
yacyCore.peerActions.loadSeedLists();
final yacySeed target = yacyCore.seedDB.getConnected(args[1]);
final String wordhashe = plasmaWordIndexEntry.word2hash("test");
final String wordhashe = indexEntryAttribute.word2hash("test");
//System.out.println("permission=" + permissionMessage(args[1]));
// should we use the proxy?

@ -72,6 +72,7 @@ import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler;
import de.anomic.http.httpc.response;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap;
@ -1035,7 +1036,7 @@ public final class yacy {
try {
String word;
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
while ((word = br.readLine()) != null) wordmap.put(plasmaWordIndexEntry.word2hash(word),word);
while ((word = br.readLine()) != null) wordmap.put(indexEntryAttribute.word2hash(word),word);
br.close();
} catch (IOException e) {}
return wordmap;
@ -1140,7 +1141,7 @@ public final class yacy {
Iterator i = stopwords.iterator();
while (i.hasNext()) {
w = (String) i.next();
f = plasmaWordIndexEntity.wordHash2path(dbRoot, plasmaWordIndexEntry.word2hash(w));
f = plasmaWordIndexEntity.wordHash2path(dbRoot, indexEntryAttribute.word2hash(w));
if (f.exists()) {
thisamount = f.length();
if (f.delete()) {

Loading…
Cancel
Save