start with refactoring of index management

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2110 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent 63a08307c2
commit a474669338

@ -57,6 +57,7 @@ import java.util.TreeMap;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaURL;
@ -167,7 +168,7 @@ public class IndexControl_p {
switchboard.wordIndex.deleteIndex(keyhash); switchboard.wordIndex.deleteIndex(keyhash);
post.remove("keyhashdeleteall"); post.remove("keyhashdeleteall");
if (keystring.length() > 0 && if (keystring.length() > 0 &&
plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) { indexEntryAttribute.word2hash(keystring).equals(keyhash)) {
post.put("keystringsearch", "generated"); post.put("keystringsearch", "generated");
} else { } else {
post.put("keyhashsearch", "generated"); post.put("keyhashsearch", "generated");
@ -189,7 +190,7 @@ public class IndexControl_p {
// thinks that it was called for a list presentation // thinks that it was called for a list presentation
post.remove("keyhashdelete"); post.remove("keyhashdelete");
if (keystring.length() > 0 && if (keystring.length() > 0 &&
plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) { indexEntryAttribute.word2hash(keystring).equals(keyhash)) {
post.put("keystringsearch", "generated"); post.put("keystringsearch", "generated");
} else { } else {
post.put("keyhashsearch", "generated"); post.put("keyhashsearch", "generated");
@ -220,7 +221,7 @@ public class IndexControl_p {
} }
if (post.containsKey("keystringsearch")) { if (post.containsKey("keystringsearch")) {
keyhash = plasmaWordIndexEntry.word2hash(keystring); keyhash = indexEntryAttribute.word2hash(keystring);
prop.put("keyhash", keyhash); prop.put("keyhash", keyhash);
prop.put("urlstring", ""); prop.put("urlstring", "");
prop.put("urlhash", ""); prop.put("urlhash", "");
@ -229,7 +230,7 @@ public class IndexControl_p {
if (post.containsKey("keyhashsearch")) { if (post.containsKey("keyhashsearch")) {
if (keystring.length() == 0 || if (keystring.length() == 0 ||
!plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) { !indexEntryAttribute.word2hash(keystring).equals(keyhash)) {
prop.put("keystring", "<not possible to compute word from hash>"); prop.put("keystring", "<not possible to compute word from hash>");
} }
prop.put("urlstring", ""); prop.put("urlstring", "");
@ -240,7 +241,7 @@ public class IndexControl_p {
// transfer to other peer // transfer to other peer
if (post.containsKey("keyhashtransfer")) { if (post.containsKey("keyhashtransfer")) {
if (keystring.length() == 0 || if (keystring.length() == 0 ||
!plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) { !indexEntryAttribute.word2hash(keystring).equals(keyhash)) {
prop.put("keystring", "<not possible to compute word from hash>"); prop.put("keystring", "<not possible to compute word from hash>");
} }
prop.put("urlstring", ""); prop.put("urlstring", "");

@ -58,12 +58,12 @@ import java.util.Map;
import java.net.InetAddress; import java.net.InetAddress;
import java.net.UnknownHostException; import java.net.UnknownHostException;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaCondenser;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
@ -463,7 +463,7 @@ public class dir {
false, /*localneed*/ false, /*localneed*/
condenser.RESULT_WORD_ENTROPHY, condenser.RESULT_WORD_ENTROPHY,
"**", /*language*/ "**", /*language*/
plasmaWordIndexEntry.DT_SHARE, /*doctype*/ indexEntryAttribute.DT_SHARE, /*doctype*/
phrase.length(), /*size*/ phrase.length(), /*size*/
condenser.RESULT_NUMB_WORDS condenser.RESULT_NUMB_WORDS
); );
@ -476,7 +476,7 @@ public class dir {
); );
final String urlHash = newEntry.hash(); final String urlHash = newEntry.hash();
/*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, null, condenser, "**", plasmaWordIndexEntry.DT_SHARE, 0, 0); /*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, null, condenser, "**", indexEntryAttribute.DT_SHARE, 0, 0);
} catch (IOException e) {} } catch (IOException e) {}
} }
@ -487,7 +487,7 @@ public class dir {
Map.Entry entry; Map.Entry entry;
while (words.hasNext()) { while (words.hasNext()) {
entry = (Map.Entry) words.next(); entry = (Map.Entry) words.next();
switchboard.wordIndex.removeEntries(plasmaWordIndexEntry.word2hash((String) entry.getKey()), new String[] {urlhash}, true); switchboard.wordIndex.removeEntries(indexEntryAttribute.word2hash((String) entry.getKey()), new String[] {urlhash}, true);
} }
switchboard.urlPool.loadedURL.remove(urlhash); switchboard.urlPool.loadedURL.remove(urlhash);
} catch (Exception e) { } catch (Exception e) {

@ -49,6 +49,7 @@
import java.util.HashSet; import java.util.HashSet;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchEvent;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
@ -56,7 +57,6 @@ import de.anomic.plasma.plasmaSearchResult;
import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSearchTimingProfile;
import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSnippetCache;
import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.plasma.plasmaSearchQuery; import de.anomic.plasma.plasmaSearchQuery;
import de.anomic.server.serverCore; import de.anomic.server.serverCore;
import de.anomic.server.serverObjects; import de.anomic.server.serverObjects;
@ -100,9 +100,9 @@ public final class search {
} }
// prepare search // prepare search
final HashSet keyhashes = new HashSet(query.length() / plasmaWordIndexEntry.wordHashLength); final HashSet keyhashes = new HashSet(query.length() / indexEntryAttribute.wordHashLength);
for (int i = 0; i < (query.length() / plasmaWordIndexEntry.wordHashLength); i++) { for (int i = 0; i < (query.length() / indexEntryAttribute.wordHashLength); i++) {
keyhashes.add(query.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength)); keyhashes.add(query.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength));
} }
final long timestamp = System.currentTimeMillis(); final long timestamp = System.currentTimeMillis();

@ -69,11 +69,11 @@ import org.w3c.dom.NodeList;
import org.xml.sax.SAXException; import org.xml.sax.SAXException;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterOutputStream; import de.anomic.htmlFilter.htmlFilterOutputStream;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroMap;
import de.anomic.plasma.plasmaURL; import de.anomic.plasma.plasmaURL;
import de.anomic.plasma.plasmaWordIndexEntry;
import de.anomic.server.serverFileUtils; import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
@ -85,7 +85,7 @@ public class bookmarksDB {
HashMap bookmarkCache; HashMap bookmarkCache;
public static String tagHash(String tagName){ public static String tagHash(String tagName){
return plasmaWordIndexEntry.word2hash(tagName.toLowerCase()); return indexEntryAttribute.word2hash(tagName.toLowerCase());
} }
public static String dateToiso8601(Date date){ public static String dateToiso8601(Date date){
return new SimpleDateFormat("yyyy-MM-dd").format(date)+"T"+(new SimpleDateFormat("HH:mm:ss")).format(date)+"Z"; return new SimpleDateFormat("yyyy-MM-dd").format(date)+"T"+(new SimpleDateFormat("HH:mm:ss")).format(date)+"Z";

@ -0,0 +1,172 @@
// indexEntryAttribute.java
// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 16.05.2006 on http://www.anomic.de
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.index;
import java.net.URL;
import java.util.Locale;
import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
/**
 * Shared attribute definitions for index entries: hash lengths, document type
 * codes, appearance and URL flag positions, and static helpers that derive a
 * word hash, a document type, or a language code.
 */
public class indexEntryAttribute {

    // the size of a word hash
    public static final int wordHashLength = yacySeedDB.commonHashLength; // 12
    public static final int urlHashLength  = yacySeedDB.commonHashLength; // 12

    // doctypes:
    public static final char DT_PDFPS   = 'p';
    public static final char DT_TEXT    = 't';
    public static final char DT_HTML    = 'h';
    public static final char DT_DOC     = 'd';
    public static final char DT_IMAGE   = 'i';
    public static final char DT_MOVIE   = 'm';
    public static final char DT_FLASH   = 'f';
    public static final char DT_SHARE   = 's';
    public static final char DT_AUDIO   = 'a';
    public static final char DT_BINARY  = 'b';
    public static final char DT_UNKNOWN = 'u';

    // appearance locations: (used for flags)
    public static final int AP_TITLE     =  0; // title tag from html header
    public static final int AP_H1        =  1; // headline - top level
    public static final int AP_H2        =  2; // headline, second level
    public static final int AP_H3        =  3; // headline, 3rd level
    public static final int AP_H4        =  4; // headline, 4th level
    public static final int AP_H5        =  5; // headline, 5th level
    public static final int AP_H6        =  6; // headline, 6th level
    public static final int AP_TEXT      =  7; // word appears in text (used to check validation of other appearances against spam)
    public static final int AP_DOM       =  8; // word inside an url: in Domain
    public static final int AP_PATH      =  9; // word inside an url: in path
    public static final int AP_IMG       = 10; // tag inside image references
    public static final int AP_ANCHOR    = 11; // anchor description
    public static final int AP_ENV       = 12; // word appears in environment (similar to anchor appearance)
    public static final int AP_BOLD      = 13; // may be interpreted as emphasized
    public static final int AP_ITALICS   = 14; // may be interpreted as emphasized
    public static final int AP_WEAK      = 15; // for text that is small or barely visible
    public static final int AP_INVISIBLE = 16; // good for spam detection
    public static final int AP_TAG       = 17; // for tagged indexing (i.e. using mp3 tags)
    public static final int AP_AUTHOR    = 18; // word appears in author name
    public static final int AP_OPUS      = 19; // word appears in name of opus, which may be an album name (in mp3 tags)
    public static final int AP_TRACK     = 20; // word appears in track name (i.e. in mp3 tags)

    // URL attributes
    public static final int UA_LOCAL    = 0; // URL was crawled locally
    public static final int UA_TILDE    = 1; // tilde appears in URL
    public static final int UA_REDIRECT = 2; // The URL is a redirection

    // local flag attributes
    public static final char LT_LOCAL  = 'L';
    public static final char LT_GLOBAL = 'G';

    /**
     * Creates the hash of a word: the word is lower-cased, MD5-encoded,
     * encoded with the enhanced base64 coder and cut to {@link #wordHashLength}
     * characters.
     *
     * @param word the word to hash; must not be null
     * @return the first {@code wordHashLength} characters of the encoded digest
     */
    public static String word2hash(String word) {
        // A fixed locale keeps the hash identical on every platform; the default
        // locale would e.g. turn "I" into a dotless i under a Turkish locale.
        final String normalized = word.toLowerCase(Locale.ENGLISH);
        return kelondroBase64Order.enhancedCoder
                .encode(serverCodings.encodeMD5Raw(normalized))
                .substring(0, indexEntryAttribute.wordHashLength);
    }

    /**
     * Derives the document type from the file extension at the end of a URL.
     * The check is done on the result of
     * {@code htmlFilterContentScraper.urlNormalform(url)} and is case-sensitive.
     *
     * @param url the URL to classify
     * @return one of the {@code DT_*} codes, {@link #DT_UNKNOWN} if no known
     *         extension matches
     */
    public static char docType(URL url) {
        final String path = htmlFilterContentScraper.urlNormalform(url);

        // No listed suffix is a suffix of another one, so the order of the
        // checks below does not influence the result.
        if (path.endsWith(".gif") || path.endsWith(".jpg")
                || path.endsWith(".jpeg") || path.endsWith(".png")) {
            return DT_IMAGE;
        }
        if (path.endsWith(".html")) {
            return DT_HTML;
        }
        if (path.endsWith(".txt")) {
            return DT_TEXT;
        }
        if (path.endsWith(".doc") || path.endsWith(".rtf")) {
            return DT_DOC;
        }
        if (path.endsWith(".pdf") || path.endsWith(".ps")) {
            return DT_PDFPS;
        }
        if (path.endsWith(".avi") || path.endsWith(".mov") || path.endsWith(".qt")
                || path.endsWith(".mpg") || path.endsWith(".mpeg")) {
            return DT_MOVIE;
        }
        if (path.endsWith(".md5")) {
            return DT_SHARE;
        }
        // NOTE(review): .asf is mapped to the flash type here although it looks
        // like a video container; kept as-is to preserve stored doctypes - confirm.
        if (path.endsWith(".asf")) {
            return DT_FLASH;
        }
        return DT_UNKNOWN;
    }

    /**
     * Derives the document type from a MIME type.
     * <p>
     * The value is matched case-insensitively, and parameters following a
     * semicolon (such as {@code "; charset=UTF-8"}) are ignored, so that
     * {@code "text/html; charset=UTF-8"} is classified as HTML.
     *
     * @param mime the MIME type, may be null
     * @return one of the {@code DT_*} codes, {@link #DT_UNKNOWN} for null or
     *         unrecognized values
     */
    public static char docType(String mime) {
        if (mime == null) {
            return DT_UNKNOWN;
        }

        // strip optional parameters and normalize case before matching
        final int paramStart = mime.indexOf(';');
        final String bare = (paramStart < 0) ? mime : mime.substring(0, paramStart);
        final String type = bare.trim().toLowerCase(Locale.ENGLISH);

        // the image prefix wins over every subtype check below
        if (type.startsWith("image/")) {
            return DT_IMAGE;
        }

        // subtype checks; these take precedence over the generic prefixes further down
        if (type.endsWith("/gif") || type.endsWith("/jpeg") || type.endsWith("/png")) {
            return DT_IMAGE;
        }
        if (type.endsWith("/html")) {
            return DT_HTML;
        }
        if (type.endsWith("/rtf") || type.endsWith("/msword") || type.endsWith("/mspowerpoint")) {
            return DT_DOC;
        }
        if (type.endsWith("/pdf") || type.endsWith("/postscript")) {
            return DT_PDFPS;
        }
        if (type.endsWith("/octet-stream")) {
            return DT_BINARY;
        }
        if (type.endsWith("/x-shockwave-flash")) {
            return DT_FLASH;
        }

        // generic top-level types
        if (type.startsWith("text/")) {
            return DT_TEXT;
        }
        if (type.startsWith("audio/")) {
            return DT_AUDIO;
        }
        if (type.startsWith("video/")) {
            return DT_MOVIE;
        }

        // MIME types that are currently not mapped to a doctype:
        //bz2     = application/x-bzip2
        //dvi     = application/x-dvi
        //gz      = application/gzip
        //hqx     = application/mac-binhex40
        //lha     = application/x-lzh
        //lzh     = application/x-lzh
        //pac     = application/x-ns-proxy-autoconfig
        //php     = application/x-httpd-php
        //phtml   = application/x-httpd-php
        //rss     = application/xml
        //tar     = application/tar
        //tex     = application/x-tex
        //tgz     = application/tar
        //torrent = application/x-bittorrent
        //xhtml   = application/xhtml+xml
        //xla     = application/msexcel
        //xls     = application/msexcel
        //xsl     = application/xml
        //xml     = application/xml
        //Z       = application/x-compress
        //zip     = application/zip
        return DT_UNKNOWN;
    }

    /**
     * Guesses a language code from the host name of a URL: if the host ends
     * with a two-letter top-level domain, that domain (lower-cased) is
     * returned, otherwise the default {@code "uk"}.
     * <p>
     * A two-character last component that is not made of letters (for example
     * the last octet of a dotted IPv4 address) is not treated as a language.
     *
     * @param url the URL whose host is inspected
     * @return a two-character code, {@code "uk"} if none can be derived
     */
    public static String language(URL url) {
        // NOTE(review): "uk" is presumably meant as "unknown" - confirm.
        final String fallback = "uk";
        final String host = url.getHost();
        final int pos = host.lastIndexOf('.');
        if (pos <= 0 || host.length() - pos != 3) {
            return fallback;
        }
        final String tld = host.substring(pos + 1);
        if (!Character.isLetter(tld.charAt(0)) || !Character.isLetter(tld.charAt(1))) {
            return fallback;
        }
        return tld.toLowerCase(Locale.ENGLISH);
    }
}

@ -56,6 +56,7 @@ package de.anomic.plasma;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroMap;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
@ -761,9 +762,9 @@ public final class plasmaHTCache {
this.lastModified = responseHeader.lastModified(); this.lastModified = responseHeader.lastModified();
if (this.lastModified == null) this.lastModified = new Date(serverDate.correctedUTCTime()); // does not exist in header if (this.lastModified == null) this.lastModified = new Date(serverDate.correctedUTCTime()); // does not exist in header
} }
this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime()); this.doctype = indexEntryAttribute.docType(responseHeader.mime());
if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url); if (this.doctype == indexEntryAttribute.DT_UNKNOWN) this.doctype = indexEntryAttribute.docType(url);
this.language = plasmaWordIndexEntry.language(url); this.language = indexEntryAttribute.language(url);
// to be defined later: // to be defined later:
this.cacheArray = null; this.cacheArray = null;

@ -47,6 +47,7 @@ import java.util.TreeSet;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterAbstractScraper; import de.anomic.htmlFilter.htmlFilterAbstractScraper;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.kelondro.kelondroNaturalOrder;
import de.anomic.server.serverByteBuffer; import de.anomic.server.serverByteBuffer;
@ -100,14 +101,14 @@ public final class plasmaSearchQuery {
public static Set words2hashes(String[] words) { public static Set words2hashes(String[] words) {
TreeSet hashes = new TreeSet(); TreeSet hashes = new TreeSet();
for (int i = 0; i < words.length; i++) hashes.add(plasmaWordIndexEntry.word2hash(words[i])); for (int i = 0; i < words.length; i++) hashes.add(indexEntryAttribute.word2hash(words[i]));
return hashes; return hashes;
} }
public static Set words2hashes(Set words) { public static Set words2hashes(Set words) {
Iterator i = words.iterator(); Iterator i = words.iterator();
TreeSet hashes = new TreeSet(); TreeSet hashes = new TreeSet();
while (i.hasNext()) hashes.add(plasmaWordIndexEntry.word2hash((String) i.next())); while (i.hasNext()) hashes.add(indexEntryAttribute.word2hash((String) i.next()));
return hashes; return hashes;
} }

@ -55,6 +55,7 @@ import java.net.MalformedURLException;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.serverCodings; import de.anomic.server.serverCodings;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute;
public final class plasmaSearchResult { public final class plasmaSearchResult {
@ -249,7 +250,7 @@ public final class plasmaSearchResult {
word = words[i].toLowerCase(); word = words[i].toLowerCase();
if ((word.length() > 2) && if ((word.length() > 2) &&
("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) && ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
(!(query.queryHashes.contains(plasmaWordIndexEntry.word2hash(word))))) (!(query.queryHashes.contains(indexEntryAttribute.word2hash(word)))))
ref.incScore(word); ref.incScore(word);
} }
} }

@ -55,6 +55,7 @@ import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.server.logging.serverLog; import de.anomic.server.logging.serverLog;
import de.anomic.yacy.yacySearch; import de.anomic.yacy.yacySearch;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute;
public class plasmaSnippetCache { public class plasmaSnippetCache {
@ -125,7 +126,7 @@ public class plasmaSnippetCache {
while (i.hasNext()) { while (i.hasNext()) {
h = (String) i.next(); h = (String) i.next();
for (int j = 0; j < w.length; j++) { for (int j = 0; j < w.length; j++) {
if (plasmaWordIndexEntry.word2hash(w[j]).equals(h)) w[j] = "<b>" + w[j] + "</b>"; if (indexEntryAttribute.word2hash(w[j]).equals(h)) w[j] = "<b>" + w[j] + "</b>";
} }
} }
StringBuffer l = new StringBuffer(line.length() + queryHashes.size() * 8); StringBuffer l = new StringBuffer(line.length() + queryHashes.size() * 8);
@ -347,7 +348,7 @@ public class plasmaSnippetCache {
String word; String word;
while (words.hasMoreElements()) { while (words.hasMoreElements()) {
word = (String) words.nextElement(); word = (String) words.nextElement();
map.put(plasmaWordIndexEntry.word2hash(word), new Integer(pos)); map.put(indexEntryAttribute.word2hash(word), new Integer(pos));
pos += word.length() + 1; pos += word.length() + 1;
} }
return map; return map;

@ -130,6 +130,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpHeader; import de.anomic.http.httpHeader;
import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpRemoteProxyConfig;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMSetTools; import de.anomic.kelondro.kelondroMSetTools;
@ -1422,8 +1423,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
referrerHash, referrerHash,
0, true, 0, true,
condenser.RESULT_WORD_ENTROPHY, condenser.RESULT_WORD_ENTROPHY,
plasmaWordIndexEntry.language(entry.url()), indexEntryAttribute.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()), indexEntryAttribute.docType(document.getMimeType()),
(int) entry.size(), (int) entry.size(),
condenser.RESULT_NUMB_WORDS condenser.RESULT_NUMB_WORDS
); );
@ -1451,14 +1452,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
(storagePeerHash.trim().length() == 0) || (storagePeerHash.trim().length() == 0) ||
((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){ ((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), document, condenser, words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), document, condenser,
plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()), indexEntryAttribute.language(entry.url()), indexEntryAttribute.docType(document.getMimeType()),
ioLinks[0].intValue(), ioLinks[1].intValue()); ioLinks[0].intValue(), ioLinks[1].intValue());
} else { } else {
HashMap urlCache = new HashMap(1); HashMap urlCache = new HashMap(1);
urlCache.put(newEntry.hash(),newEntry); urlCache.put(newEntry.hash(),newEntry);
ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS); ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS);
String language = plasmaWordIndexEntry.language(entry.url()); String language = indexEntryAttribute.language(entry.url());
char doctype = plasmaWordIndexEntry.docType(document.getMimeType()); char doctype = indexEntryAttribute.docType(document.getMimeType());
int urlLength = newEntry.url().toString().length(); int urlLength = newEntry.url().toString().length();
int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length; int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length;
@ -1470,7 +1471,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
wentry = (Map.Entry) i.next(); wentry = (Map.Entry) i.next();
String word = (String) wentry.getKey(); String word = (String) wentry.getKey();
wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); wordStat = (plasmaCondenser.wordStatProp) wentry.getValue();
String wordHash = plasmaWordIndexEntry.word2hash(word); String wordHash = indexEntryAttribute.word2hash(word);
plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash); plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash);
plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash, plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps, urlLength, urlComps,
@ -1509,8 +1510,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (error != null) { if (error != null) {
words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(),
document, condenser, document, condenser,
plasmaWordIndexEntry.language(entry.url()), indexEntryAttribute.language(entry.url()),
plasmaWordIndexEntry.docType(document.getMimeType()), indexEntryAttribute.docType(document.getMimeType()),
ioLinks[0].intValue(), ioLinks[1].intValue()); ioLinks[0].intValue(), ioLinks[1].intValue());
} }
@ -1990,7 +1991,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
while (iter.hasNext()) { while (iter.hasNext()) {
word = (String) iter.next(); word = (String) iter.next();
// delete the URL reference in this word index // delete the URL reference in this word index
count += wordIndex.removeEntries(plasmaWordIndexEntry.word2hash(word), urlEntries, true); count += wordIndex.removeEntries(indexEntryAttribute.word2hash(word), urlEntries, true);
} }
return count; return count;
} }
@ -2006,7 +2007,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
entry = (Map.Entry) wordStatPropIterator.next(); entry = (Map.Entry) wordStatPropIterator.next();
word = (String) entry.getKey(); word = (String) entry.getKey();
// delete the URL reference in this word index // delete the URL reference in this word index
count += wordIndex.removeEntries(plasmaWordIndexEntry.word2hash(word), urlEntries, true); count += wordIndex.removeEntries(indexEntryAttribute.word2hash(word), urlEntries, true);
} }
return count; return count;
} }

@ -58,6 +58,7 @@ import java.util.TreeSet;
import java.net.URL; import java.net.URL;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroMergeIterator; import de.anomic.kelondro.kelondroMergeIterator;
@ -247,7 +248,7 @@ public final class plasmaWordIndex {
word = (String) wentry.getKey(); word = (String) wentry.getKey();
wprop = (plasmaCondenser.wordStatProp) wentry.getValue(); wprop = (plasmaCondenser.wordStatProp) wentry.getValue();
// if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c);
wordHash = plasmaWordIndexEntry.word2hash(word); wordHash = indexEntryAttribute.word2hash(word);
ientry = new plasmaWordIndexEntry(urlHash, ientry = new plasmaWordIndexEntry(urlHash,
urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(), urlLength, urlComps, (document == null) ? urlLength : document.longTitle.length(),
wprop.count, wprop.count,

@ -56,6 +56,7 @@ import java.io.File;
import java.io.IOException; import java.io.IOException;
import java.util.Iterator; import java.util.Iterator;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroException;
import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroRecords;
import de.anomic.kelondro.kelondroTree; import de.anomic.kelondro.kelondroTree;
@ -66,10 +67,10 @@ public final class plasmaWordIndexAssortment {
// environment constants // environment constants
private static final String assortmentFileName = "indexAssortment"; private static final String assortmentFileName = "indexAssortment";
public static final int[] bufferStructureBasis = new int[]{ public static final int[] bufferStructureBasis = new int[]{
plasmaWordIndexEntry.wordHashLength, // a wordHash indexEntryAttribute.wordHashLength, // a wordHash
4, // occurrence counter 4, // occurrence counter
8, // timestamp of last access 8, // timestamp of last access
plasmaWordIndexEntry.urlHashLength, // corresponding URL hash indexEntryAttribute.urlHashLength, // corresponding URL hash
plasmaWordIndexEntry.attrSpace // URL attributes plasmaWordIndexEntry.attrSpace // URL attributes
}; };

@ -48,13 +48,9 @@
package de.anomic.plasma; package de.anomic.plasma;
import java.net.URL;
import java.util.Properties; import java.util.Properties;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
// import de.anomic.server.logging.serverLog;
public final class plasmaWordIndexEntry implements Cloneable { public final class plasmaWordIndexEntry implements Cloneable {
@ -62,10 +58,6 @@ public final class plasmaWordIndexEntry implements Cloneable {
// by the discrete values of the entry // by the discrete values of the entry
// or by the encoded entry-string // or by the encoded entry-string
// the size of a word hash
public static final int wordHashLength = yacySeedDB.commonHashLength; // 12
public static final int urlHashLength = yacySeedDB.commonHashLength; // 12
// the size of the index entry attributes // the size of the index entry attributes
public static final int attrSpace = 24; public static final int attrSpace = 24;
@ -86,134 +78,6 @@ public final class plasmaWordIndexEntry implements Cloneable {
private char doctype; // type of source private char doctype; // type of source
private char localflag; // indicates if the index was created locally private char localflag; // indicates if the index was created locally
// doctypes:
// one-character codes for the type of the source document
public static final char DT_PDFPS = 'p';
public static final char DT_TEXT = 't';
public static final char DT_HTML = 'h';
public static final char DT_DOC = 'd';
public static final char DT_IMAGE = 'i';
public static final char DT_MOVIE = 'm';
public static final char DT_FLASH = 'f';
public static final char DT_SHARE = 's';
public static final char DT_AUDIO = 'a';
public static final char DT_BINARY = 'b';
public static final char DT_UNKNOWN = 'u';
// appearance locations: (used for flags)
// each constant is a distinct position number from 0 to 20
public static final int AP_TITLE = 0; // title tag from html header
public static final int AP_H1 = 1; // headline - top level
public static final int AP_H2 = 2; // headline, second level
public static final int AP_H3 = 3; // headline, 3rd level
public static final int AP_H4 = 4; // headline, 4th level
public static final int AP_H5 = 5; // headline, 5th level
public static final int AP_H6 = 6; // headline, 6th level
public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam)
public static final int AP_DOM = 8; // word inside an url: in Domain
public static final int AP_PATH = 9; // word inside an url: in path
public static final int AP_IMG = 10; // tag inside image references
public static final int AP_ANCHOR = 11; // anchor description
public static final int AP_ENV = 12; // word appears in environment (similar to anchor appearance)
public static final int AP_BOLD = 13; // may be interpreted as emphasized
public static final int AP_ITALICS = 14; // may be interpreted as emphasized
public static final int AP_WEAK = 15; // for text that is small or barely visible
public static final int AP_INVISIBLE = 16; // good for spam detection
public static final int AP_TAG = 17; // for tagged indexing (i.e. using mp3 tags)
public static final int AP_AUTHOR = 18; // word appears in author name
public static final int AP_OPUS = 19; // word appears in name of opus, which may be an album name (in mp3 tags)
public static final int AP_TRACK = 20; // word appears in track name (i.e. in mp3 tags)
// URL attributes
// NOTE(review): presumably position numbers like the AP_ values above - confirm against the users of these constants
public static final int UA_LOCAL = 0; // URL was crawled locally
public static final int UA_TILDE = 1; // tilde appears in URL
public static final int UA_REDIRECT = 2; // The URL is a redirection
// local flag attributes
// one-character values of the local flag, which indicates if the index was created locally
public static final char LT_LOCAL = 'L';
public static final char LT_GLOBAL = 'G';
// create a word hash
// the word is lower-cased, passed through serverCodings.encodeMD5Raw, encoded with
// kelondroBase64Order.enhancedCoder and cut to its first wordHashLength characters;
// word must not be null
public static String word2hash(String word) {
    // lower-case with a fixed locale: the default-locale variant maps 'I' to a dotless i
    // under e.g. a Turkish default locale, which would give the same word a different
    // hash on such a machine
    final String normalized = word.toLowerCase(java.util.Locale.ENGLISH);
    return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(normalized)).substring(0, wordHashLength);
}
// doctype calculation
// maps the file extension at the end of the normalized url to one of the DT_ codes;
// returns DT_UNKNOWN if none of the extensions listed below matches
public static char docType(URL url) {
    // compare in lower case with a fixed locale so that upper-case extensions
    // such as ".JPG" are recognized as well
    final String path = htmlFilterContentScraper.urlNormalform(url).toLowerCase(java.util.Locale.ENGLISH);
    // serverLog.logFinest("PLASMA", "docType URL=" + path);
    char doctype = DT_UNKNOWN;
    // none of the extensions is a suffix of another one, so the order of the checks does not matter
    if (path.endsWith(".gif") || path.endsWith(".jpg") || path.endsWith(".jpeg") || path.endsWith(".png")) {
        doctype = DT_IMAGE;
    } else if (path.endsWith(".html")) {
        doctype = DT_HTML;
    } else if (path.endsWith(".txt")) {
        doctype = DT_TEXT;
    } else if (path.endsWith(".doc") || path.endsWith(".rtf")) {
        doctype = DT_DOC;
    } else if (path.endsWith(".pdf") || path.endsWith(".ps")) {
        doctype = DT_PDFPS;
    } else if (path.endsWith(".avi") || path.endsWith(".mov") || path.endsWith(".qt") || path.endsWith(".mpg") || path.endsWith(".mpeg")) {
        doctype = DT_MOVIE;
    } else if (path.endsWith(".md5")) {
        doctype = DT_SHARE;
    } else if (path.endsWith(".asf")) {
        // NOTE(review): .asf is mapped to DT_FLASH, kept as it was - confirm that this is intended
        doctype = DT_FLASH;
    }
    return doctype;
}
// doctype calculation from a mime type
// returns DT_UNKNOWN for null and for every mime type that is not mapped below
public static char docType(String mime) {
    // serverLog.logFinest("PLASMA", "docType mime=" + mime);
    if (mime == null) return DT_UNKNOWN;

    // a content type may carry parameters ("text/html; charset=UTF-8") and may be written
    // in any case; reduce it to the bare lower-case type/subtype before matching, otherwise
    // the endsWith checks below miss and e.g. html is classified as plain text
    String type = mime;
    final int semicolon = type.indexOf(';');
    if (semicolon >= 0) type = type.substring(0, semicolon);
    type = type.trim().toLowerCase(java.util.Locale.ENGLISH);

    // the order of the checks matters: the subtype checks must come before the generic "text/" check
    char doctype = DT_UNKNOWN;
    if (type.startsWith("image/")) doctype = DT_IMAGE;
    else if (type.endsWith("/gif")) doctype = DT_IMAGE;
    else if (type.endsWith("/jpeg")) doctype = DT_IMAGE;
    else if (type.endsWith("/png")) doctype = DT_IMAGE;
    else if (type.endsWith("/html")) doctype = DT_HTML;
    else if (type.endsWith("/rtf")) doctype = DT_DOC;
    else if (type.endsWith("/pdf")) doctype = DT_PDFPS;
    else if (type.endsWith("/octet-stream")) doctype = DT_BINARY;
    else if (type.endsWith("/x-shockwave-flash")) doctype = DT_FLASH;
    else if (type.endsWith("/msword")) doctype = DT_DOC;
    else if (type.endsWith("/mspowerpoint")) doctype = DT_DOC;
    else if (type.endsWith("/postscript")) doctype = DT_PDFPS;
    else if (type.startsWith("text/")) doctype = DT_TEXT;
    else if (type.startsWith("audio/")) doctype = DT_AUDIO;
    else if (type.startsWith("video/")) doctype = DT_MOVIE;
    // mime types that are not mapped to a doctype yet:
    //bz2 = application/x-bzip2
    //dvi = application/x-dvi
    //gz = application/gzip
    //hqx = application/mac-binhex40
    //lha = application/x-lzh
    //lzh = application/x-lzh
    //pac = application/x-ns-proxy-autoconfig
    //php = application/x-httpd-php
    //phtml = application/x-httpd-php
    //rss = application/xml
    //tar = application/tar
    //tex = application/x-tex
    //tgz = application/tar
    //torrent = application/x-bittorrent
    //xhtml = application/xhtml+xml
    //xla = application/msexcel
    //xls = application/msexcel
    //xsl = application/xml
    //xml = application/xml
    //Z = application/x-compress
    //zip = application/zip
    return doctype;
}
// language calculation
// derives a two-letter language code from the top-level domain of the url's host;
// returns "uk" if the host does not end with a two-letter alphabetic top-level domain
public static String language(URL url) {
    final String fallback = "uk";
    final String host = url.getHost();
    final int pos = host.lastIndexOf('.');
    // only a top-level domain of exactly two characters qualifies
    if ((pos <= 0) || (host.length() - pos != 3)) return fallback;
    final String tld = host.substring(pos + 1);
    // a two-digit ending belongs to an IP address host (e.g. "192.168.1.10"), not to a country domain
    if (!Character.isLetter(tld.charAt(0)) || !Character.isLetter(tld.charAt(1))) return fallback;
    // lower-case with a fixed locale; the default-locale variant turns "IT" into a
    // dotless-i string under e.g. a Turkish default locale
    return tld.toLowerCase(java.util.Locale.ENGLISH);
}
// the class instantiation can only be done by a plasmaStore method // the class instantiation can only be done by a plasmaStore method
// therefore they are all public // therefore they are all public
public plasmaWordIndexEntry(String urlHash, public plasmaWordIndexEntry(String urlHash,
@ -255,7 +119,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
this.quality = quality; this.quality = quality;
this.language = language.getBytes(); this.language = language.getBytes();
this.doctype = doctype; this.doctype = doctype;
this.localflag = (local) ? LT_LOCAL : LT_GLOBAL; this.localflag = (local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL;
} }
public plasmaWordIndexEntry(String urlHash, String code) { public plasmaWordIndexEntry(String urlHash, String code) {
@ -299,7 +163,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__")); this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__"));
this.language = pr.getProperty("l", "uk").getBytes(); this.language = pr.getProperty("l", "uk").getBytes();
this.doctype = pr.getProperty("d", "u").charAt(0); this.doctype = pr.getProperty("d", "u").charAt(0);
this.localflag = pr.getProperty("f", ""+LT_LOCAL).charAt(0); this.localflag = pr.getProperty("f", ""+indexEntryAttribute.LT_LOCAL).charAt(0);
} }
public Object clone() { public Object clone() {
@ -412,7 +276,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
public int phrasecount() { return phrasecount; } public int phrasecount() { return phrasecount; }
public String getLanguage() { return new String(language); } public String getLanguage() { return new String(language); }
public char getType() { return doctype; } public char getType() { return doctype; }
public boolean isLocal() { return localflag == LT_LOCAL; } public boolean isLocal() { return localflag == indexEntryAttribute.LT_LOCAL; }
public boolean isNewer(plasmaWordIndexEntry other) { public boolean isNewer(plasmaWordIndexEntry other) {
if (other == null) return true; if (other == null) return true;
@ -439,7 +303,7 @@ public final class plasmaWordIndexEntry implements Cloneable {
public static void main(String[] args) { public static void main(String[] args) {
// outputs the word hash to a given word // outputs the word hash to a given word
if (args.length != 1) System.exit(0); if (args.length != 1) System.exit(0);
System.out.println("WORDHASH: " + word2hash(args[0])); System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0]));
} }
} }

@ -53,6 +53,7 @@ import java.util.Iterator;
import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.http.httpc; import de.anomic.http.httpc;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaCrawlLURL;
import de.anomic.plasma.plasmaSearchRankingProfile; import de.anomic.plasma.plasmaSearchRankingProfile;
@ -464,10 +465,10 @@ public final class yacyClient {
//System.out.println("***result count " + results); //System.out.println("***result count " + results);
// create containers // create containers
final int words = wordhashes.length() / plasmaWordIndexEntry.wordHashLength; final int words = wordhashes.length() / indexEntryAttribute.wordHashLength;
plasmaWordIndexEntryContainer[] container = new plasmaWordIndexEntryContainer[words]; plasmaWordIndexEntryContainer[] container = new plasmaWordIndexEntryContainer[words];
for (int i = 0; i < words; i++) { for (int i = 0; i < words; i++) {
container[i] = new plasmaWordIndexEntryContainer(wordhashes.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength)); container[i] = new plasmaWordIndexEntryContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength));
} }
// insert results to containers // insert results to containers
@ -1093,7 +1094,7 @@ public final class yacyClient {
/*final yacyCore core =*/ new yacyCore(sb); /*final yacyCore core =*/ new yacyCore(sb);
yacyCore.peerActions.loadSeedLists(); yacyCore.peerActions.loadSeedLists();
final yacySeed target = yacyCore.seedDB.getConnected(args[1]); final yacySeed target = yacyCore.seedDB.getConnected(args[1]);
final String wordhashe = plasmaWordIndexEntry.word2hash("test"); final String wordhashe = indexEntryAttribute.word2hash("test");
//System.out.println("permission=" + permissionMessage(args[1])); //System.out.println("permission=" + permissionMessage(args[1]));
// should we use the proxy? // should we use the proxy?

@ -72,6 +72,7 @@ import de.anomic.http.httpd;
import de.anomic.http.httpdFileHandler; import de.anomic.http.httpdFileHandler;
import de.anomic.http.httpdProxyHandler; import de.anomic.http.httpdProxyHandler;
import de.anomic.http.httpc.response; import de.anomic.http.httpc.response;
import de.anomic.index.indexEntryAttribute;
import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroDyn;
import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMScoreCluster;
import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroMap;
@ -1035,7 +1036,7 @@ public final class yacy {
try { try {
String word; String word;
BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist))); BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist)));
while ((word = br.readLine()) != null) wordmap.put(plasmaWordIndexEntry.word2hash(word),word); while ((word = br.readLine()) != null) wordmap.put(indexEntryAttribute.word2hash(word),word);
br.close(); br.close();
} catch (IOException e) {} } catch (IOException e) {}
return wordmap; return wordmap;
@ -1140,7 +1141,7 @@ public final class yacy {
Iterator i = stopwords.iterator(); Iterator i = stopwords.iterator();
while (i.hasNext()) { while (i.hasNext()) {
w = (String) i.next(); w = (String) i.next();
f = plasmaWordIndexEntity.wordHash2path(dbRoot, plasmaWordIndexEntry.word2hash(w)); f = plasmaWordIndexEntity.wordHash2path(dbRoot, indexEntryAttribute.word2hash(w));
if (f.exists()) { if (f.exists()) {
thisamount = f.length(); thisamount = f.length();
if (f.delete()) { if (f.delete()) {

Loading…
Cancel
Save