diff --git a/htroot/IndexControl_p.java b/htroot/IndexControl_p.java index 1c5ec3b3c..f0fbf3b14 100644 --- a/htroot/IndexControl_p.java +++ b/htroot/IndexControl_p.java @@ -57,6 +57,7 @@ import java.util.TreeMap; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; +import de.anomic.index.indexEntryAttribute; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; @@ -167,7 +168,7 @@ public class IndexControl_p { switchboard.wordIndex.deleteIndex(keyhash); post.remove("keyhashdeleteall"); if (keystring.length() > 0 && - plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) { + indexEntryAttribute.word2hash(keystring).equals(keyhash)) { post.put("keystringsearch", "generated"); } else { post.put("keyhashsearch", "generated"); @@ -189,7 +190,7 @@ public class IndexControl_p { // thinks that it was called for a list presentation post.remove("keyhashdelete"); if (keystring.length() > 0 && - plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) { + indexEntryAttribute.word2hash(keystring).equals(keyhash)) { post.put("keystringsearch", "generated"); } else { post.put("keyhashsearch", "generated"); @@ -220,7 +221,7 @@ public class IndexControl_p { } if (post.containsKey("keystringsearch")) { - keyhash = plasmaWordIndexEntry.word2hash(keystring); + keyhash = indexEntryAttribute.word2hash(keystring); prop.put("keyhash", keyhash); prop.put("urlstring", ""); prop.put("urlhash", ""); @@ -229,7 +230,7 @@ public class IndexControl_p { if (post.containsKey("keyhashsearch")) { if (keystring.length() == 0 || - !plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) { + !indexEntryAttribute.word2hash(keystring).equals(keyhash)) { prop.put("keystring", ""); } prop.put("urlstring", ""); @@ -240,7 +241,7 @@ public class IndexControl_p { // transfer to other peer if (post.containsKey("keyhashtransfer")) { if (keystring.length() == 0 || - 
!plasmaWordIndexEntry.word2hash(keystring).equals(keyhash)) { + !indexEntryAttribute.word2hash(keystring).equals(keyhash)) { prop.put("keystring", ""); } prop.put("urlstring", ""); diff --git a/htroot/htdocsdefault/dir.java b/htroot/htdocsdefault/dir.java index 98bc19e26..06a41607e 100644 --- a/htroot/htdocsdefault/dir.java +++ b/htroot/htdocsdefault/dir.java @@ -58,12 +58,12 @@ import java.util.Map; import java.net.InetAddress; import java.net.UnknownHostException; import de.anomic.http.httpHeader; +import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.plasma.plasmaCondenser; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSwitchboard; import de.anomic.plasma.plasmaURL; -import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.server.serverCodings; import de.anomic.server.serverCore; import de.anomic.server.serverFileUtils; @@ -463,7 +463,7 @@ public class dir { false, /*localneed*/ condenser.RESULT_WORD_ENTROPHY, "**", /*language*/ - plasmaWordIndexEntry.DT_SHARE, /*doctype*/ + indexEntryAttribute.DT_SHARE, /*doctype*/ phrase.length(), /*size*/ condenser.RESULT_NUMB_WORDS ); @@ -476,7 +476,7 @@ public class dir { ); final String urlHash = newEntry.hash(); - /*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, null, condenser, "**", plasmaWordIndexEntry.DT_SHARE, 0, 0); + /*final int words =*/ switchboard.wordIndex.addPageIndex(url, urlHash, new Date(), phrase.length() + descr.length() + 13, null, condenser, "**", indexEntryAttribute.DT_SHARE, 0, 0); } catch (IOException e) {} } @@ -487,7 +487,7 @@ public class dir { Map.Entry entry; while (words.hasNext()) { entry = (Map.Entry) words.next(); - switchboard.wordIndex.removeEntries(plasmaWordIndexEntry.word2hash((String) entry.getKey()), new String[] {urlhash}, true); + switchboard.wordIndex.removeEntries(indexEntryAttribute.word2hash((String) entry.getKey()), new 
String[] {urlhash}, true); } switchboard.urlPool.loadedURL.remove(urlhash); } catch (Exception e) { diff --git a/htroot/yacy/search.java b/htroot/yacy/search.java index 956885e31..19d6e8531 100644 --- a/htroot/yacy/search.java +++ b/htroot/yacy/search.java @@ -49,6 +49,7 @@ import java.util.HashSet; import de.anomic.http.httpHeader; +import de.anomic.index.indexEntryAttribute; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearchEvent; import de.anomic.plasma.plasmaSearchRankingProfile; @@ -56,7 +57,6 @@ import de.anomic.plasma.plasmaSearchResult; import de.anomic.plasma.plasmaSearchTimingProfile; import de.anomic.plasma.plasmaSnippetCache; import de.anomic.plasma.plasmaSwitchboard; -import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.plasma.plasmaSearchQuery; import de.anomic.server.serverCore; import de.anomic.server.serverObjects; @@ -100,9 +100,9 @@ public final class search { } // prepare search - final HashSet keyhashes = new HashSet(query.length() / plasmaWordIndexEntry.wordHashLength); - for (int i = 0; i < (query.length() / plasmaWordIndexEntry.wordHashLength); i++) { - keyhashes.add(query.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength)); + final HashSet keyhashes = new HashSet(query.length() / indexEntryAttribute.wordHashLength); + for (int i = 0; i < (query.length() / indexEntryAttribute.wordHashLength); i++) { + keyhashes.add(query.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); } final long timestamp = System.currentTimeMillis(); diff --git a/source/de/anomic/data/bookmarksDB.java b/source/de/anomic/data/bookmarksDB.java index f2cd64108..b7d6334ee 100644 --- a/source/de/anomic/data/bookmarksDB.java +++ b/source/de/anomic/data/bookmarksDB.java @@ -69,11 +69,11 @@ import org.w3c.dom.NodeList; import org.xml.sax.SAXException; import de.anomic.htmlFilter.htmlFilterContentScraper; import 
de.anomic.htmlFilter.htmlFilterOutputStream; +import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMap; import de.anomic.plasma.plasmaURL; -import de.anomic.plasma.plasmaWordIndexEntry; import de.anomic.server.serverFileUtils; import de.anomic.server.logging.serverLog; @@ -85,7 +85,7 @@ public class bookmarksDB { HashMap bookmarkCache; public static String tagHash(String tagName){ - return plasmaWordIndexEntry.word2hash(tagName.toLowerCase()); + return indexEntryAttribute.word2hash(tagName.toLowerCase()); } public static String dateToiso8601(Date date){ return new SimpleDateFormat("yyyy-MM-dd").format(date)+"T"+(new SimpleDateFormat("HH:mm:ss")).format(date)+"Z"; diff --git a/source/de/anomic/index/indexEntryAttribute.java b/source/de/anomic/index/indexEntryAttribute.java new file mode 100644 index 000000000..f77ef3d89 --- /dev/null +++ b/source/de/anomic/index/indexEntryAttribute.java @@ -0,0 +1,172 @@ +// indexEntryAttribute.java +// (C) 2006 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany +// first published 16.05.2006 on http://www.anomic.de +// +// This is a part of YaCy, a peer-to-peer based web search engine +// +// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $ +// $LastChangedRevision: 1986 $ +// $LastChangedBy: orbiter $ +// +// LICENSE +// +// This program is free software; you can redistribute it and/or modify +// it under the terms of the GNU General Public License as published by +// the Free Software Foundation; either version 2 of the License, or +// (at your option) any later version. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public License for more details. 
+// +// You should have received a copy of the GNU General Public License +// along with this program; if not, write to the Free Software +// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + + + +package de.anomic.index; + +import java.net.URL; + +import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.kelondro.kelondroBase64Order; +import de.anomic.server.serverCodings; +import de.anomic.yacy.yacySeedDB; + +public class indexEntryAttribute { + + // the size of a word hash + public static final int wordHashLength = yacySeedDB.commonHashLength; // 12 + public static final int urlHashLength = yacySeedDB.commonHashLength; // 12 + + // doctypes: + public static final char DT_PDFPS = 'p'; + public static final char DT_TEXT = 't'; + public static final char DT_HTML = 'h'; + public static final char DT_DOC = 'd'; + public static final char DT_IMAGE = 'i'; + public static final char DT_MOVIE = 'm'; + public static final char DT_FLASH = 'f'; + public static final char DT_SHARE = 's'; + public static final char DT_AUDIO = 'a'; + public static final char DT_BINARY = 'b'; + public static final char DT_UNKNOWN = 'u'; + + // appearance locations: (used for flags) + public static final int AP_TITLE = 0; // title tag from html header + public static final int AP_H1 = 1; // headline - top level + public static final int AP_H2 = 2; // headline, second level + public static final int AP_H3 = 3; // headline, 3rd level + public static final int AP_H4 = 4; // headline, 4th level + public static final int AP_H5 = 5; // headline, 5th level + public static final int AP_H6 = 6; // headline, 6th level + public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam) + public static final int AP_DOM = 8; // word inside an url: in Domain + public static final int AP_PATH = 9; // word inside an url: in path + public static final int AP_IMG = 10; // tag inside image references + public static final 
int AP_ANCHOR = 11; // anchor description + public static final int AP_ENV = 12; // word appears in environment (similar to anchor appearance) + public static final int AP_BOLD = 13; // may be interpreted as emphasized + public static final int AP_ITALICS = 14; // may be interpreted as emphasized + public static final int AP_WEAK = 15; // for Text that is small or barely visible + public static final int AP_INVISIBLE = 16; // good for spam detection + public static final int AP_TAG = 17; // for tagged indexing (i.e. using mp3 tags) + public static final int AP_AUTHOR = 18; // word appears in author name + public static final int AP_OPUS = 19; // word appears in name of opus, which may be an album name (in mp3 tags) + public static final int AP_TRACK = 20; // word appears in track name (i.e. in mp3 tags) + + // URL attributes + public static final int UA_LOCAL = 0; // URL was crawled locally + public static final int UA_TILDE = 1; // tilde appears in URL + public static final int UA_REDIRECT = 2; // The URL is a redirection + + // local flag attributes + public static final char LT_LOCAL = 'L'; + public static final char LT_GLOBAL = 'G'; + + // create a word hash + public static String word2hash(String word) { + return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, indexEntryAttribute.wordHashLength); + } + + // doctype calculation + public static char docType(URL url) { + String path = htmlFilterContentScraper.urlNormalform(url); + // serverLog.logFinest("PLASMA", "docType URL=" + path); + char doctype = doctype = indexEntryAttribute.DT_UNKNOWN; + if (path.endsWith(".gif")) { doctype = indexEntryAttribute.DT_IMAGE; } + else if (path.endsWith(".jpg")) { doctype = indexEntryAttribute.DT_IMAGE; } + else if (path.endsWith(".jpeg")) { doctype = indexEntryAttribute.DT_IMAGE; } + else if (path.endsWith(".png")) { doctype = indexEntryAttribute.DT_IMAGE; } + else if (path.endsWith(".html")) { doctype =
indexEntryAttribute.DT_HTML; } + else if (path.endsWith(".txt")) { doctype = indexEntryAttribute.DT_TEXT; } + else if (path.endsWith(".doc")) { doctype = indexEntryAttribute.DT_DOC; } + else if (path.endsWith(".rtf")) { doctype = indexEntryAttribute.DT_DOC; } + else if (path.endsWith(".pdf")) { doctype = indexEntryAttribute.DT_PDFPS; } + else if (path.endsWith(".ps")) { doctype = indexEntryAttribute.DT_PDFPS; } + else if (path.endsWith(".avi")) { doctype = indexEntryAttribute.DT_MOVIE; } + else if (path.endsWith(".mov")) { doctype = indexEntryAttribute.DT_MOVIE; } + else if (path.endsWith(".qt")) { doctype = indexEntryAttribute.DT_MOVIE; } + else if (path.endsWith(".mpg")) { doctype = indexEntryAttribute.DT_MOVIE; } + else if (path.endsWith(".md5")) { doctype = indexEntryAttribute.DT_SHARE; } + else if (path.endsWith(".mpeg")) { doctype = indexEntryAttribute.DT_MOVIE; } + else if (path.endsWith(".asf")) { doctype = indexEntryAttribute.DT_FLASH; } + return doctype; + } + + public static char docType(String mime) { + // serverLog.logFinest("PLASMA", "docType mime=" + mime); + char doctype = indexEntryAttribute.DT_UNKNOWN; + if (mime == null) doctype = indexEntryAttribute.DT_UNKNOWN; + else if (mime.startsWith("image/")) doctype = indexEntryAttribute.DT_IMAGE; + else if (mime.endsWith("/gif")) doctype = indexEntryAttribute.DT_IMAGE; + else if (mime.endsWith("/jpeg")) doctype = indexEntryAttribute.DT_IMAGE; + else if (mime.endsWith("/png")) doctype = indexEntryAttribute.DT_IMAGE; + else if (mime.endsWith("/html")) doctype = indexEntryAttribute.DT_HTML; + else if (mime.endsWith("/rtf")) doctype = indexEntryAttribute.DT_DOC; + else if (mime.endsWith("/pdf")) doctype = indexEntryAttribute.DT_PDFPS; + else if (mime.endsWith("/octet-stream")) doctype = indexEntryAttribute.DT_BINARY; + else if (mime.endsWith("/x-shockwave-flash")) doctype = indexEntryAttribute.DT_FLASH; + else if (mime.endsWith("/msword")) doctype = indexEntryAttribute.DT_DOC; + else if 
(mime.endsWith("/mspowerpoint")) doctype = indexEntryAttribute.DT_DOC; + else if (mime.endsWith("/postscript")) doctype = indexEntryAttribute.DT_PDFPS; + else if (mime.startsWith("text/")) doctype = indexEntryAttribute.DT_TEXT; + else if (mime.startsWith("image/")) doctype = indexEntryAttribute.DT_IMAGE; + else if (mime.startsWith("audio/")) doctype = indexEntryAttribute.DT_AUDIO; + else if (mime.startsWith("video/")) doctype = indexEntryAttribute.DT_MOVIE; + //bz2 = application/x-bzip2 + //dvi = application/x-dvi + //gz = application/gzip + //hqx = application/mac-binhex40 + //lha = application/x-lzh + //lzh = application/x-lzh + //pac = application/x-ns-proxy-autoconfig + //php = application/x-httpd-php + //phtml = application/x-httpd-php + //rss = application/xml + //tar = application/tar + //tex = application/x-tex + //tgz = application/tar + //torrent = application/x-bittorrent + //xhtml = application/xhtml+xml + //xla = application/msexcel + //xls = application/msexcel + //xsl = application/xml + //xml = application/xml + //Z = application/x-compress + //zip = application/zip + return doctype; + } + + // language calculation + public static String language(URL url) { + String language = "uk"; + String host = url.getHost(); + int pos = host.lastIndexOf("."); + if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase(); + return language; + } + +} diff --git a/source/de/anomic/plasma/plasmaHTCache.java b/source/de/anomic/plasma/plasmaHTCache.java index 4ad24dabb..2b3f4f1bb 100644 --- a/source/de/anomic/plasma/plasmaHTCache.java +++ b/source/de/anomic/plasma/plasmaHTCache.java @@ -56,6 +56,7 @@ package de.anomic.plasma; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpc; import de.anomic.http.httpHeader; +import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroMap; import de.anomic.kelondro.kelondroMScoreCluster; @@ -761,9 +762,9 @@ 
public final class plasmaHTCache { this.lastModified = responseHeader.lastModified(); if (this.lastModified == null) this.lastModified = new Date(serverDate.correctedUTCTime()); // does not exist in header } - this.doctype = plasmaWordIndexEntry.docType(responseHeader.mime()); - if (this.doctype == plasmaWordIndexEntry.DT_UNKNOWN) this.doctype = plasmaWordIndexEntry.docType(url); - this.language = plasmaWordIndexEntry.language(url); + this.doctype = indexEntryAttribute.docType(responseHeader.mime()); + if (this.doctype == indexEntryAttribute.DT_UNKNOWN) this.doctype = indexEntryAttribute.docType(url); + this.language = indexEntryAttribute.language(url); // to be defined later: this.cacheArray = null; diff --git a/source/de/anomic/plasma/plasmaSearchQuery.java b/source/de/anomic/plasma/plasmaSearchQuery.java index 0f68e887b..d6077c2c2 100644 --- a/source/de/anomic/plasma/plasmaSearchQuery.java +++ b/source/de/anomic/plasma/plasmaSearchQuery.java @@ -47,6 +47,7 @@ import java.util.TreeSet; import java.util.Iterator; import de.anomic.htmlFilter.htmlFilterAbstractScraper; +import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroNaturalOrder; import de.anomic.server.serverByteBuffer; @@ -100,14 +101,14 @@ public final class plasmaSearchQuery { public static Set words2hashes(String[] words) { TreeSet hashes = new TreeSet(); - for (int i = 0; i < words.length; i++) hashes.add(plasmaWordIndexEntry.word2hash(words[i])); + for (int i = 0; i < words.length; i++) hashes.add(indexEntryAttribute.word2hash(words[i])); return hashes; } public static Set words2hashes(Set words) { Iterator i = words.iterator(); TreeSet hashes = new TreeSet(); - while (i.hasNext()) hashes.add(plasmaWordIndexEntry.word2hash((String) i.next())); + while (i.hasNext()) hashes.add(indexEntryAttribute.word2hash((String) i.next())); return hashes; } diff --git a/source/de/anomic/plasma/plasmaSearchResult.java b/source/de/anomic/plasma/plasmaSearchResult.java index 2734e866f..a8353db42 
100644 --- a/source/de/anomic/plasma/plasmaSearchResult.java +++ b/source/de/anomic/plasma/plasmaSearchResult.java @@ -55,6 +55,7 @@ import java.net.MalformedURLException; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.server.serverCodings; import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.index.indexEntryAttribute; public final class plasmaSearchResult { @@ -249,7 +250,7 @@ public final class plasmaSearchResult { word = words[i].toLowerCase(); if ((word.length() > 2) && ("http_html_php_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) && - (!(query.queryHashes.contains(plasmaWordIndexEntry.word2hash(word))))) + (!(query.queryHashes.contains(indexEntryAttribute.word2hash(word))))) ref.incScore(word); } } diff --git a/source/de/anomic/plasma/plasmaSnippetCache.java b/source/de/anomic/plasma/plasmaSnippetCache.java index ce1b1c90b..26c1b54cc 100644 --- a/source/de/anomic/plasma/plasmaSnippetCache.java +++ b/source/de/anomic/plasma/plasmaSnippetCache.java @@ -55,6 +55,7 @@ import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.server.logging.serverLog; import de.anomic.yacy.yacySearch; import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.index.indexEntryAttribute; public class plasmaSnippetCache { @@ -125,7 +126,7 @@ public class plasmaSnippetCache { while (i.hasNext()) { h = (String) i.next(); for (int j = 0; j < w.length; j++) { - if (plasmaWordIndexEntry.word2hash(w[j]).equals(h)) w[j] = "" + w[j] + ""; + if (indexEntryAttribute.word2hash(w[j]).equals(h)) w[j] = "" + w[j] + ""; } } StringBuffer l = new StringBuffer(line.length() + queryHashes.size() * 8); @@ -347,7 +348,7 @@ public class plasmaSnippetCache { String word; while (words.hasMoreElements()) { word = (String) words.nextElement(); - map.put(plasmaWordIndexEntry.word2hash(word), new Integer(pos)); + map.put(indexEntryAttribute.word2hash(word), new Integer(pos)); pos += word.length() + 1; } return map; 
diff --git a/source/de/anomic/plasma/plasmaSwitchboard.java b/source/de/anomic/plasma/plasmaSwitchboard.java index e1e329ecf..6204be613 100644 --- a/source/de/anomic/plasma/plasmaSwitchboard.java +++ b/source/de/anomic/plasma/plasmaSwitchboard.java @@ -130,6 +130,7 @@ import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpHeader; import de.anomic.http.httpRemoteProxyConfig; import de.anomic.http.httpc; +import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMSetTools; @@ -1422,8 +1423,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser referrerHash, 0, true, condenser.RESULT_WORD_ENTROPHY, - plasmaWordIndexEntry.language(entry.url()), - plasmaWordIndexEntry.docType(document.getMimeType()), + indexEntryAttribute.language(entry.url()), + indexEntryAttribute.docType(document.getMimeType()), (int) entry.size(), condenser.RESULT_NUMB_WORDS ); @@ -1451,14 +1452,14 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser (storagePeerHash.trim().length() == 0) || ((seed = yacyCore.seedDB.getConnected(storagePeerHash))==null)){ words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), document, condenser, - plasmaWordIndexEntry.language(entry.url()), plasmaWordIndexEntry.docType(document.getMimeType()), + indexEntryAttribute.language(entry.url()), indexEntryAttribute.docType(document.getMimeType()), ioLinks[0].intValue(), ioLinks[1].intValue()); } else { HashMap urlCache = new HashMap(1); urlCache.put(newEntry.hash(),newEntry); ArrayList tmpContainers = new ArrayList(condenser.RESULT_SIMI_WORDS); - String language = plasmaWordIndexEntry.language(entry.url()); - char doctype = plasmaWordIndexEntry.docType(document.getMimeType()); + String language = indexEntryAttribute.language(entry.url()); + char doctype = 
indexEntryAttribute.docType(document.getMimeType()); int urlLength = newEntry.url().toString().length(); int urlComps = htmlFilterContentScraper.urlComps(newEntry.url().toString()).length; @@ -1470,7 +1471,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser wentry = (Map.Entry) i.next(); String word = (String) wentry.getKey(); wordStat = (plasmaCondenser.wordStatProp) wentry.getValue(); - String wordHash = plasmaWordIndexEntry.word2hash(word); + String wordHash = indexEntryAttribute.word2hash(word); plasmaWordIndexEntryContainer wordIdxContainer = new plasmaWordIndexEntryContainer(wordHash); plasmaWordIndexEntry wordIdxEntry = new plasmaWordIndexEntry(urlHash, urlLength, urlComps, @@ -1509,8 +1510,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser if (error != null) { words = wordIndex.addPageIndex(entry.url(), urlHash, docDate, (int) entry.size(), document, condenser, - plasmaWordIndexEntry.language(entry.url()), - plasmaWordIndexEntry.docType(document.getMimeType()), + indexEntryAttribute.language(entry.url()), + indexEntryAttribute.docType(document.getMimeType()), ioLinks[0].intValue(), ioLinks[1].intValue()); } @@ -1990,7 +1991,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser while (iter.hasNext()) { word = (String) iter.next(); // delete the URL reference in this word index - count += wordIndex.removeEntries(plasmaWordIndexEntry.word2hash(word), urlEntries, true); + count += wordIndex.removeEntries(indexEntryAttribute.word2hash(word), urlEntries, true); } return count; } @@ -2006,7 +2007,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser entry = (Map.Entry) wordStatPropIterator.next(); word = (String) entry.getKey(); // delete the URL reference in this word index - count += wordIndex.removeEntries(plasmaWordIndexEntry.word2hash(word), urlEntries, true); + count += 
wordIndex.removeEntries(indexEntryAttribute.word2hash(word), urlEntries, true); } return count; } diff --git a/source/de/anomic/plasma/plasmaWordIndex.java b/source/de/anomic/plasma/plasmaWordIndex.java index 46d3af383..71c59c209 100644 --- a/source/de/anomic/plasma/plasmaWordIndex.java +++ b/source/de/anomic/plasma/plasmaWordIndex.java @@ -58,6 +58,7 @@ import java.util.TreeSet; import java.net.URL; import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroMergeIterator; @@ -247,7 +248,7 @@ public final class plasmaWordIndex { word = (String) wentry.getKey(); wprop = (plasmaCondenser.wordStatProp) wentry.getValue(); // if ((s.length() > 4) && (c > 1)) System.out.println("# " + s + ":" + c); - wordHash = plasmaWordIndexEntry.word2hash(word); + wordHash = indexEntryAttribute.word2hash(word); ientry = new plasmaWordIndexEntry(urlHash, urlLength, urlComps, (document == null) ? 
urlLength : document.longTitle.length(), wprop.count, diff --git a/source/de/anomic/plasma/plasmaWordIndexAssortment.java b/source/de/anomic/plasma/plasmaWordIndexAssortment.java index 9dd8eb7d0..2dd481a51 100644 --- a/source/de/anomic/plasma/plasmaWordIndexAssortment.java +++ b/source/de/anomic/plasma/plasmaWordIndexAssortment.java @@ -56,6 +56,7 @@ import java.io.File; import java.io.IOException; import java.util.Iterator; +import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroException; import de.anomic.kelondro.kelondroRecords; import de.anomic.kelondro.kelondroTree; @@ -66,10 +67,10 @@ public final class plasmaWordIndexAssortment { // environment constants private static final String assortmentFileName = "indexAssortment"; public static final int[] bufferStructureBasis = new int[]{ - plasmaWordIndexEntry.wordHashLength, // a wordHash + indexEntryAttribute.wordHashLength, // a wordHash 4, // occurrence counter 8, // timestamp of last access - plasmaWordIndexEntry.urlHashLength, // corresponding URL hash + indexEntryAttribute.urlHashLength, // corresponding URL hash plasmaWordIndexEntry.attrSpace // URL attributes }; diff --git a/source/de/anomic/plasma/plasmaWordIndexEntry.java b/source/de/anomic/plasma/plasmaWordIndexEntry.java index 60c49b677..0f2834269 100644 --- a/source/de/anomic/plasma/plasmaWordIndexEntry.java +++ b/source/de/anomic/plasma/plasmaWordIndexEntry.java @@ -48,13 +48,9 @@ package de.anomic.plasma; -import java.net.URL; import java.util.Properties; -import de.anomic.htmlFilter.htmlFilterContentScraper; +import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroBase64Order; -import de.anomic.server.serverCodings; -import de.anomic.yacy.yacySeedDB; -// import de.anomic.server.logging.serverLog; public final class plasmaWordIndexEntry implements Cloneable { @@ -62,10 +58,6 @@ public final class plasmaWordIndexEntry implements Cloneable { // by the discrete values of the entry // or by the encoded 
entry-string - // the size of a word hash - public static final int wordHashLength = yacySeedDB.commonHashLength; // 12 - public static final int urlHashLength = yacySeedDB.commonHashLength; // 12 - // the size of the index entry attributes public static final int attrSpace = 24; @@ -86,134 +78,6 @@ public final class plasmaWordIndexEntry implements Cloneable { private char doctype; // type of source private char localflag; // indicates if the index was created locally - // doctypes: - public static final char DT_PDFPS = 'p'; - public static final char DT_TEXT = 't'; - public static final char DT_HTML = 'h'; - public static final char DT_DOC = 'd'; - public static final char DT_IMAGE = 'i'; - public static final char DT_MOVIE = 'm'; - public static final char DT_FLASH = 'f'; - public static final char DT_SHARE = 's'; - public static final char DT_AUDIO = 'a'; - public static final char DT_BINARY = 'b'; - public static final char DT_UNKNOWN = 'u'; - - // appearance locations: (used for flags) - public static final int AP_TITLE = 0; // title tag from html header - public static final int AP_H1 = 1; // headline - top level - public static final int AP_H2 = 2; // headline, second level - public static final int AP_H3 = 3; // headline, 3rd level - public static final int AP_H4 = 4; // headline, 4th level - public static final int AP_H5 = 5; // headline, 5th level - public static final int AP_H6 = 6; // headline, 6th level - public static final int AP_TEXT = 7; // word appears in text (used to check validation of other appearances against spam) - public static final int AP_DOM = 8; // word inside an url: in Domain - public static final int AP_PATH = 9; // word inside an url: in path - public static final int AP_IMG = 10; // tag inside image references - public static final int AP_ANCHOR = 11; // anchor description - public static final int AP_ENV = 12; // word appears in environment (similar to anchor appearance) - public static final int AP_BOLD = 13; // may be 
interpreted as emphasized - public static final int AP_ITALICS = 14; // may be interpreted as emphasized - public static final int AP_WEAK = 15; // for Text that is small or bareley visible - public static final int AP_INVISIBLE = 16; // good for spam detection - public static final int AP_TAG = 17; // for tagged indexeing (i.e. using mp3 tags) - public static final int AP_AUTHOR = 18; // word appears in author name - public static final int AP_OPUS = 19; // word appears in name of opus, which may be an album name (in mp3 tags) - public static final int AP_TRACK = 20; // word appears in track name (i.e. in mp3 tags) - - // URL attributes - public static final int UA_LOCAL = 0; // URL was crawled locally - public static final int UA_TILDE = 1; // tilde appears in URL - public static final int UA_REDIRECT = 2; // The URL is a redirection - - // local flag attributes - public static final char LT_LOCAL = 'L'; - public static final char LT_GLOBAL = 'G'; - - // create a word hash - public static String word2hash(String word) { - return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, wordHashLength); - } - - // doctype calculation - public static char docType(URL url) { - String path = htmlFilterContentScraper.urlNormalform(url); - // serverLog.logFinest("PLASMA", "docType URL=" + path); - char doctype = doctype = DT_UNKNOWN; - if (path.endsWith(".gif")) { doctype = DT_IMAGE; } - else if (path.endsWith(".jpg")) { doctype = DT_IMAGE; } - else if (path.endsWith(".jpeg")) { doctype = DT_IMAGE; } - else if (path.endsWith(".png")) { doctype = DT_IMAGE; } - else if (path.endsWith(".html")) { doctype = DT_HTML; } - else if (path.endsWith(".txt")) { doctype = DT_TEXT; } - else if (path.endsWith(".doc")) { doctype = DT_DOC; } - else if (path.endsWith(".rtf")) { doctype = DT_DOC; } - else if (path.endsWith(".pdf")) { doctype = DT_PDFPS; } - else if (path.endsWith(".ps")) { doctype = DT_PDFPS; } - else if (path.endsWith(".avi")) 
{ doctype = DT_MOVIE; } - else if (path.endsWith(".mov")) { doctype = DT_MOVIE; } - else if (path.endsWith(".qt")) { doctype = DT_MOVIE; } - else if (path.endsWith(".mpg")) { doctype = DT_MOVIE; } - else if (path.endsWith(".md5")) { doctype = DT_SHARE; } - else if (path.endsWith(".mpeg")) { doctype = DT_MOVIE; } - else if (path.endsWith(".asf")) { doctype = DT_FLASH; } - return doctype; - } - - public static char docType(String mime) { - // serverLog.logFinest("PLASMA", "docType mime=" + mime); - char doctype = DT_UNKNOWN; - if (mime == null) doctype = DT_UNKNOWN; - else if (mime.startsWith("image/")) doctype = DT_IMAGE; - else if (mime.endsWith("/gif")) doctype = DT_IMAGE; - else if (mime.endsWith("/jpeg")) doctype = DT_IMAGE; - else if (mime.endsWith("/png")) doctype = DT_IMAGE; - else if (mime.endsWith("/html")) doctype = DT_HTML; - else if (mime.endsWith("/rtf")) doctype = DT_DOC; - else if (mime.endsWith("/pdf")) doctype = DT_PDFPS; - else if (mime.endsWith("/octet-stream")) doctype = DT_BINARY; - else if (mime.endsWith("/x-shockwave-flash")) doctype = DT_FLASH; - else if (mime.endsWith("/msword")) doctype = DT_DOC; - else if (mime.endsWith("/mspowerpoint")) doctype = DT_DOC; - else if (mime.endsWith("/postscript")) doctype = DT_PDFPS; - else if (mime.startsWith("text/")) doctype = DT_TEXT; - else if (mime.startsWith("image/")) doctype = DT_IMAGE; - else if (mime.startsWith("audio/")) doctype = DT_AUDIO; - else if (mime.startsWith("video/")) doctype = DT_MOVIE; - //bz2 = application/x-bzip2 - //dvi = application/x-dvi - //gz = application/gzip - //hqx = application/mac-binhex40 - //lha = application/x-lzh - //lzh = application/x-lzh - //pac = application/x-ns-proxy-autoconfig - //php = application/x-httpd-php - //phtml = application/x-httpd-php - //rss = application/xml - //tar = application/tar - //tex = application/x-tex - //tgz = application/tar - //torrent = application/x-bittorrent - //xhtml = application/xhtml+xml - //xla = application/msexcel - //xls = 
application/msexcel - //xsl = application/xml - //xml = application/xml - //Z = application/x-compress - //zip = application/zip - return doctype; - } - - // language calculation - public static String language(URL url) { - String language = "uk"; - String host = url.getHost(); - int pos = host.lastIndexOf("."); - if ((pos > 0) && (host.length() - pos == 3)) language = host.substring(pos + 1).toLowerCase(); - return language; - } - // the class instantiation can only be done by a plasmaStore method // therefore they are all public public plasmaWordIndexEntry(String urlHash, @@ -255,7 +119,7 @@ public final class plasmaWordIndexEntry implements Cloneable { this.quality = quality; this.language = language.getBytes(); this.doctype = doctype; - this.localflag = (local) ? LT_LOCAL : LT_GLOBAL; + this.localflag = (local) ? indexEntryAttribute.LT_LOCAL : indexEntryAttribute.LT_GLOBAL; } public plasmaWordIndexEntry(String urlHash, String code) { @@ -299,7 +163,7 @@ public final class plasmaWordIndexEntry implements Cloneable { this.quality = (int) kelondroBase64Order.enhancedCoder.decodeLong(pr.getProperty("q", "__")); this.language = pr.getProperty("l", "uk").getBytes(); this.doctype = pr.getProperty("d", "u").charAt(0); - this.localflag = pr.getProperty("f", ""+LT_LOCAL).charAt(0); + this.localflag = pr.getProperty("f", ""+indexEntryAttribute.LT_LOCAL).charAt(0); } public Object clone() { @@ -412,7 +276,7 @@ public final class plasmaWordIndexEntry implements Cloneable { public int phrasecount() { return phrasecount; } public String getLanguage() { return new String(language); } public char getType() { return doctype; } - public boolean isLocal() { return localflag == LT_LOCAL; } + public boolean isLocal() { return localflag == indexEntryAttribute.LT_LOCAL; } public boolean isNewer(plasmaWordIndexEntry other) { if (other == null) return true; @@ -439,7 +303,7 @@ public final class plasmaWordIndexEntry implements Cloneable { public static void main(String[] args) { // 
outputs the word hash to a given word if (args.length != 1) System.exit(0); - System.out.println("WORDHASH: " + word2hash(args[0])); + System.out.println("WORDHASH: " + indexEntryAttribute.word2hash(args[0])); } } diff --git a/source/de/anomic/yacy/yacyClient.java b/source/de/anomic/yacy/yacyClient.java index 0ccab03f5..ad972c37b 100644 --- a/source/de/anomic/yacy/yacyClient.java +++ b/source/de/anomic/yacy/yacyClient.java @@ -53,6 +53,7 @@ import java.util.Iterator; import de.anomic.htmlFilter.htmlFilterContentScraper; import de.anomic.http.httpc; +import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroBase64Order; import de.anomic.plasma.plasmaCrawlLURL; import de.anomic.plasma.plasmaSearchRankingProfile; @@ -464,10 +465,10 @@ public final class yacyClient { //System.out.println("***result count " + results); // create containers - final int words = wordhashes.length() / plasmaWordIndexEntry.wordHashLength; + final int words = wordhashes.length() / indexEntryAttribute.wordHashLength; plasmaWordIndexEntryContainer[] container = new plasmaWordIndexEntryContainer[words]; for (int i = 0; i < words; i++) { - container[i] = new plasmaWordIndexEntryContainer(wordhashes.substring(i * plasmaWordIndexEntry.wordHashLength, (i + 1) * plasmaWordIndexEntry.wordHashLength)); + container[i] = new plasmaWordIndexEntryContainer(wordhashes.substring(i * indexEntryAttribute.wordHashLength, (i + 1) * indexEntryAttribute.wordHashLength)); } // insert results to containers @@ -1093,7 +1094,7 @@ public final class yacyClient { /*final yacyCore core =*/ new yacyCore(sb); yacyCore.peerActions.loadSeedLists(); final yacySeed target = yacyCore.seedDB.getConnected(args[1]); - final String wordhashe = plasmaWordIndexEntry.word2hash("test"); + final String wordhashe = indexEntryAttribute.word2hash("test"); //System.out.println("permission=" + permissionMessage(args[1])); // should we use the proxy? 
diff --git a/source/yacy.java b/source/yacy.java index 63d08bc6b..9394c5bc7 100644 --- a/source/yacy.java +++ b/source/yacy.java @@ -72,6 +72,7 @@ import de.anomic.http.httpd; import de.anomic.http.httpdFileHandler; import de.anomic.http.httpdProxyHandler; import de.anomic.http.httpc.response; +import de.anomic.index.indexEntryAttribute; import de.anomic.kelondro.kelondroDyn; import de.anomic.kelondro.kelondroMScoreCluster; import de.anomic.kelondro.kelondroMap; @@ -1035,7 +1036,7 @@ public final class yacy { try { String word; BufferedReader br = new BufferedReader(new InputStreamReader(new FileInputStream(wordlist))); - while ((word = br.readLine()) != null) wordmap.put(plasmaWordIndexEntry.word2hash(word),word); + while ((word = br.readLine()) != null) wordmap.put(indexEntryAttribute.word2hash(word),word); br.close(); } catch (IOException e) {} return wordmap; @@ -1140,7 +1141,7 @@ public final class yacy { Iterator i = stopwords.iterator(); while (i.hasNext()) { w = (String) i.next(); - f = plasmaWordIndexEntity.wordHash2path(dbRoot, plasmaWordIndexEntry.word2hash(w)); + f = plasmaWordIndexEntity.wordHash2path(dbRoot, indexEntryAttribute.word2hash(w)); if (f.exists()) { thisamount = f.length(); if (f.delete()) {