diff --git a/htroot/Bookmarks.html b/htroot/Bookmarks.html index 9cb74d700..a580c0410 100644 --- a/htroot/Bookmarks.html +++ b/htroot/Bookmarks.html @@ -229,7 +229,7 @@ To see a list of all APIs, please visit the Delete / - Info + Info

#{/bookmarks}# diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html index 944bd1405..5d0d55fcc 100644 --- a/htroot/CrawlStartExpert_p.html +++ b/htroot/CrawlStartExpert_p.html @@ -17,6 +17,9 @@ + +
+ #%env/templates/header.template%# #%env/templates/submenuIndexCreate.template%#

Expert Crawl Start

diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html index 70ab6f426..bdcfc89c7 100644 --- a/htroot/CrawlStartSite_p.html +++ b/htroot/CrawlStartSite_p.html @@ -17,6 +17,9 @@ + +
+ #%env/templates/header.template%# #%env/templates/submenuIndexCreate.template%#

Site Crawling

diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html index 0f4a511f7..6a60aaea8 100644 --- a/htroot/ViewFile.html +++ b/htroot/ViewFile.html @@ -7,6 +7,14 @@ + +
+ +API + +See the page info about the url. +
+ #(display)# #%env/templates/simpleheader.template%# :: diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/getpageinfo_p.java similarity index 85% rename from htroot/api/util/getpageinfo_p.java rename to htroot/api/getpageinfo_p.java index 5a101c611..68c490807 100755 --- a/htroot/api/util/getpageinfo_p.java +++ b/htroot/api/getpageinfo_p.java @@ -8,6 +8,7 @@ import net.yacy.cora.protocol.RequestHeader; import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.document.parser.html.ContentScraper; import net.yacy.kelondro.data.meta.DigestURI; +import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import de.anomic.crawler.RobotsTxtEntry; import de.anomic.server.serverObjects; @@ -24,21 +25,23 @@ public class getpageinfo_p { prop.put("desc", ""); prop.put("lang", ""); prop.put("robots-allowed", "3"); //unknown + prop.put("robotsInfo", ""); //unknown prop.put("sitemap", ""); prop.put("favicon",""); prop.put("sitelist", ""); prop.put("filter", ".*"); // default actions - String actions="title,robots"; + String actions = "title,robots"; if (post != null && post.containsKey("url")) { - if(post.containsKey("actions")) + if (post.containsKey("actions")) actions=post.get("actions"); String url=post.get("url"); - if(url.toLowerCase().startsWith("ftp://")){ + if (url.toLowerCase().startsWith("ftp://")) { prop.put("robots-allowed", "1"); - prop.putXML("title", "FTP: "+url); + prop.put("robotsInfo", "ftp does not follow robots.txt"); + prop.putXML("title", "FTP: " + url); return prop; } else if (!url.startsWith("http://") && !url.startsWith("https://") && @@ -47,18 +50,18 @@ public class getpageinfo_p { !url.startsWith("file://")) { url = "http://" + url; } - if (actions.indexOf("title")>=0) { + if (actions.indexOf("title") >= 0) { DigestURI u = null; try { u = new DigestURI(url); } catch (final MalformedURLException e) { - // fail, do nothing + Log.logException(e); } ContentScraper scraper = null; if (u != null) try { scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST); } catch (final IOException e) { - // now thats a fail, do nothing + Log.logException(e); } if (scraper != null) { // put the document title @@ -68,9 +71,9 @@ public class getpageinfo_p { prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString()); // put keywords - final String list[]=scraper.getKeywords(); + final String list[] = scraper.getKeywords(); int count = 0; - for (final String element : list) { + for (final String element: list) { final String tag = element; if (!tag.equals("")) { prop.putXML("tags_"+count+"_tag", tag); @@ -100,7 +103,7 @@ public class getpageinfo_p { prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*"); } } - if (actions.indexOf("robots")>=0) { + if (actions.indexOf("robots") >= 0) { try { final DigestURI theURL = new DigestURI(url); @@ -110,13 +113,17 @@ public class getpageinfo_p { robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs()); } catch (final IOException e) { robotsEntry = null; + Log.logException(e); } prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1); + prop.putHTML("robotsInfo", robotsEntry.getInfo()); // get the sitemap URL of the domain final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap(); prop.putXML("sitemap", sitemapURL == null ? "" : sitemapURL.toString()); - } catch (final MalformedURLException e) {} + } catch (final MalformedURLException e) { + Log.logException(e); + } } } diff --git a/htroot/api/util/getpageinfo_p.xml b/htroot/api/getpageinfo_p.xml similarity index 92% rename from htroot/api/util/getpageinfo_p.xml rename to htroot/api/getpageinfo_p.xml index b9590c990..84da4eb97 100644 --- a/htroot/api/util/getpageinfo_p.xml +++ b/htroot/api/getpageinfo_p.xml @@ -4,6 +4,7 @@ #[desc]# #[lang]# #(robots-allowed)#0::1::#(/robots-allowed)# + #[robotsInfo]# #[sitemap]# #[favicon]# #[sitelist]# diff --git a/htroot/api/ymarks/get_metadata.java b/htroot/api/ymarks/get_metadata.java index 053c83668..702804e69 100644 --- a/htroot/api/ymarks/get_metadata.java +++ b/htroot/api/ymarks/get_metadata.java @@ -1,6 +1,8 @@ import java.io.IOException; import java.net.MalformedURLException; import java.util.EnumMap; +import java.util.Iterator; + import net.yacy.cora.protocol.RequestHeader; import net.yacy.document.Document; import net.yacy.document.Parser.Failure; @@ -8,6 +10,7 @@ import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.search.Switchboard; import de.anomic.data.UserDB; import de.anomic.data.ymark.YMarkAutoTagger; +import de.anomic.data.ymark.YMarkCrawlStart; import de.anomic.data.ymark.YMarkEntry; import de.anomic.data.ymark.YMarkMetadata; import de.anomic.data.ymark.YMarkTables; @@ -27,18 +30,43 @@ public class get_metadata { final boolean isAuthUser = user!= null && user.hasRight(UserDB.AccessRight.BOOKMARK_RIGHT); if(isAdmin || isAuthUser) { + final String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN); - try { - final String url = post.get(YMarkEntry.BOOKMARK.URL.key()); + + + String url = post.get(YMarkEntry.BOOKMARK.URL.key(),YMarkEntry.BOOKMARK.URL.deflt()); + boolean hasProtocol = false; + for (YMarkTables.PROTOCOLS p : YMarkTables.PROTOCOLS.values()) { + if(url.toLowerCase().startsWith(p.protocol())) { + hasProtocol = true; + break; + } + } + if (!hasProtocol) { + url=YMarkTables.PROTOCOLS.HTTP.protocol(url); + } + + try { YMarkMetadata meta = new YMarkMetadata(new DigestURI(url), sb.indexSegments); final Document document = meta.loadDocument(sb.loader); final EnumMap metadata = meta.loadMetadata(); prop.putXML("title", metadata.get(YMarkMetadata.METADATA.TITLE)); - prop.putXML("desc", metadata.get(YMarkMetadata.METADATA.DESCRIPTION)); - + prop.putXML("desc", metadata.get(YMarkMetadata.METADATA.DESCRIPTION)); prop.put("keywords", putTags(document.dc_subject(','), "keywords")); prop.put("autotags", putTags(YMarkAutoTagger.autoTag(document, 5, sb.tables.bookmarks.getTags(bmk_user)), "autotags")); + + final YMarkCrawlStart crawlStart = new YMarkCrawlStart(sb.tables, url); + final Iterator iter = crawlStart.keySet().iterator(); + int count = 0; + String key; + while(iter.hasNext()) { + key = iter.next(); + prop.putXML("crawlstart_"+count+"_key",key.toLowerCase()); + prop.putXML("crawlstart_"+count+"_value",crawlStart.get(key)); + count++; + } + prop.put("crawlstart", count); } catch (MalformedURLException e1) { // TODO Auto-generated catch block diff --git a/htroot/api/ymarks/get_metadata.xml b/htroot/api/ymarks/get_metadata.xml index 1cab22a98..adbb9a3b7 100644 --- a/htroot/api/ymarks/get_metadata.xml +++ b/htroot/api/ymarks/get_metadata.xml @@ -10,4 +10,9 @@ #{autotags}# #{/autotags}# + + \ No newline at end of file diff --git a/htroot/api/ymarks/get_treeview.java b/htroot/api/ymarks/get_treeview.java index 1bb453e4d..a44d97331 100644 --- a/htroot/api/ymarks/get_treeview.java +++ b/htroot/api/ymarks/get_treeview.java @@ -1,6 +1,5 @@ import java.io.IOException; import java.net.MalformedURLException; -import java.util.ArrayList; import java.util.Date; import java.util.EnumMap; import java.util.Iterator; @@ -9,18 +8,20 @@ import java.util.TreeMap; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.UTF8; import net.yacy.cora.protocol.RequestHeader; +import net.yacy.document.Document; import net.yacy.document.Parser.Failure; import net.yacy.kelondro.blob.Tables; import net.yacy.kelondro.data.meta.DigestURI; -import net.yacy.kelondro.data.word.Word; import net.yacy.kelondro.index.RowSpaceExceededException; import net.yacy.kelondro.logging.Log; import net.yacy.search.Switchboard; import de.anomic.data.UserDB; +import de.anomic.data.ymark.YMarkAutoTagger; import de.anomic.data.ymark.YMarkCrawlStart; import de.anomic.data.ymark.YMarkEntry; import de.anomic.data.ymark.YMarkMetadata; import de.anomic.data.ymark.YMarkTables; +import de.anomic.data.ymark.YMarkTag; import de.anomic.data.ymark.YMarkUtil; import de.anomic.server.serverObjects; import de.anomic.server.serverSwitch; @@ -50,7 +51,7 @@ public class get_treeview { boolean isMetadata = false; boolean isURLdb = false; boolean isCrawlStart = false; - boolean isWordCount = false; + boolean isAutoTagger = false; boolean displayBmk = false; if (post != null){ @@ -73,7 +74,7 @@ public class get_treeview { isURLdb = true; isFolder = false; } else if (post.get(ROOT).startsWith("w:")) { - isWordCount = true; + isAutoTagger = true; isFolder = false; } else if (post.get(ROOT).startsWith("c:")) { isCrawlStart = true; @@ -192,7 +193,7 @@ public class get_treeview { prop.put("folders_"+count+"_hash", "c:"+url); prop.put("folders_"+count+"_hasChildren", "true"); count++; - prop.put("folders_"+count+"_foldername","WordCounts"); + prop.put("folders_"+count+"_foldername","AutoTagger"); putProp(count, "meta"); prop.put("folders_"+count+"_hash", "w:"+url); prop.put("folders_"+count+"_hasChildren", "true"); @@ -205,23 +206,21 @@ public class get_treeview { } catch (RowSpaceExceededException e) { Log.logException(e); } - } else if (isWordCount || isMetadata || isURLdb || isCrawlStart) { + } else if (isAutoTagger || isMetadata || isURLdb || isCrawlStart) { try { final YMarkMetadata meta = new YMarkMetadata(new DigestURI(post.get(ROOT).substring(2)), sb.indexSegments); - meta.loadDocument(sb.loader); - if(isWordCount) { - final TreeMap words = meta.getWordCounts(); - final ArrayList topwords = new ArrayList(words.descendingKeySet()); - for(int i = 0; i < 20 && i < topwords.size(); i++) { - String word = topwords.get(i); - int occur = words.get(word).occurrences(); - prop.put("folders_"+count+"_foldername",""+word+": [" + occur + "]"); - putProp(count, "meta"); - count++; - } - count--; - prop.put("folders_"+count+"_comma", ""); + final Document document = meta.loadDocument(sb.loader); + final TreeMap tags = sb.tables.bookmarks.getTags(bmk_user); + if(isAutoTagger) { + prop.put("folders_"+count+"_foldername","meta-"+YMarkMetadata.METADATA.KEYWORDS.name().toLowerCase()+": " + meta.loadMetadata().get(YMarkMetadata.METADATA.KEYWORDS) + ""); + putProp(count, "meta"); + count++; + prop.put("folders_"+count+"_foldername","with preference: "+YMarkAutoTagger.autoTag(document, 4, tags)+""); + putProp(count, "meta"); count++; + prop.put("folders_"+count+"_foldername","without preference: "+YMarkAutoTagger.autoTag(document, 4, new TreeMap())+""); + putProp(count, "meta"); + count++; prop.put("folders", count); } else if(isMetadata) { count = putMeta(count, meta.loadMetadata()); diff --git a/htroot/js/Bookmarks.js b/htroot/js/Bookmarks.js index dd46ed8c7..6f8f2d8ed 100644 --- a/htroot/js/Bookmarks.js +++ b/htroot/js/Bookmarks.js @@ -24,7 +24,7 @@ function loadTitle(){ url=document.getElementsByName("url")[0].value; if(document.getElementsByName("title")[0].value==""){ - sndReq('/api/util/getpageinfo_p.xml?actions=title&url='+url); + sndReq('/api/getpageinfo_p.xml?actions=title&url='+url); } } diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js index 41f7f5378..e3c840fab 100644 --- a/htroot/js/IndexCreate.js +++ b/htroot/js/IndexCreate.js @@ -75,5 +75,7 @@ function loadInfos() { url=document.getElementById("crawlingURL").value; if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").disabled=true; else document.getElementById("crawlingQ").disabled=false; - sndReq('/api/util/getpageinfo_p.xml?actions=title,robots&url='+url); + sndReq('/api/getpageinfo_p.xml?actions=title,robots&url='+url); + document.getElementById("api").innerHTML = "APISee the page info about the start url."; + } diff --git a/htroot/yacy/ui/yacyui-bookmarks.html b/htroot/yacy/ui/yacyui-bookmarks.html index 482158902..6e46bde39 100644 --- a/htroot/yacy/ui/yacyui-bookmarks.html +++ b/htroot/yacy/ui/yacyui-bookmarks.html @@ -85,7 +85,7 @@ var url = $("input[name='bm_url']").getValue(); $.ajax({ type: "GET", - url: "/api/util/getpageinfo_p.xml?url="+url, + url: "/api/getpageinfo_p.xml?url="+url, dataType: "xml", success: function(xml) { var title = $(xml).find('title').text(); diff --git a/htroot/yacy/ui/yacyui-search.html b/htroot/yacy/ui/yacyui-search.html index 636804bd6..e233dd700 100644 --- a/htroot/yacy/ui/yacyui-search.html +++ b/htroot/yacy/ui/yacyui-search.html @@ -162,7 +162,7 @@ function getTags(url, i) { $.ajax({ type: "GET", - url: "/api/util/getpageinfo_p.xml?url="+url, + url: "/api/getpageinfo_p.xml?url="+url, dataType: "xml", success: function(xml) { tags = ""; diff --git a/source/de/anomic/crawler/RobotsTxtEntry.java b/source/de/anomic/crawler/RobotsTxtEntry.java index 1b6636883..394f87802 100644 --- a/source/de/anomic/crawler/RobotsTxtEntry.java +++ b/source/de/anomic/crawler/RobotsTxtEntry.java @@ -1,4 +1,4 @@ -//RobotsEntry.java +//RobotsEntry.java //------------------------------------- //part of YACY //(C) by Michael Peter Christen; mc@yacy.net @@ -43,7 +43,7 @@ import net.yacy.kelondro.util.ByteArray; public class RobotsTxtEntry { - + private static final String HOST_NAME = "hostname"; private static final String ALLOW_PATH_LIST = "allow"; private static final String DISALLOW_PATH_LIST = "disallow"; @@ -54,16 +54,18 @@ public class RobotsTxtEntry { private static final String CRAWL_DELAY = "crawlDelay"; private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis"; private static final String AGENT_NAME = "agentname"; - + // this is a simple record structure that holds all properties of a single crawl start private final Map mem; private final List allowPathList, denyPathList; private final String hostName, agentName; - + private String info; // this is filled if robots disallowed access; then the reason is noted there; + protected RobotsTxtEntry(final String hostName, final Map mem) { this.hostName = hostName.toLowerCase(); - this.mem = mem; - + this.mem = mem; + this.info = ""; + if (this.mem.containsKey(DISALLOW_PATH_LIST)) { this.denyPathList = new LinkedList(); final String csPl = UTF8.String(this.mem.get(DISALLOW_PATH_LIST)); @@ -89,12 +91,12 @@ public class RobotsTxtEntry { this.allowPathList = new LinkedList(); } this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null; - } - + } + protected RobotsTxtEntry( - final MultiProtocolURI theURL, - final List allowPathList, - final List disallowPathList, + final MultiProtocolURI theURL, + final List allowPathList, + final List disallowPathList, final Date loadedDate, final Date modDate, final String eTag, @@ -103,12 +105,12 @@ public class RobotsTxtEntry { final String agentName ) { if (theURL == null) throw new IllegalArgumentException("The url is missing"); - + this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase(); this.allowPathList = new LinkedList(); this.denyPathList = new LinkedList(); this.agentName = agentName; - + this.mem = new LinkedHashMap(10); this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName)); if (loadedDate != null) this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(loadedDate.getTime()))); @@ -117,92 +119,92 @@ public class RobotsTxtEntry { if (sitemap != null) this.mem.put(SITEMAP, UTF8.getBytes(sitemap)); if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, UTF8.getBytes(Long.toString(crawlDelayMillis))); if (agentName != null) this.mem.put(AGENT_NAME, UTF8.getBytes(agentName)); - + if (allowPathList != null && !allowPathList.isEmpty()) { this.allowPathList.addAll(allowPathList); - + final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30); - for (String element : allowPathList) { + for (final String element : allowPathList) { pathListStr.append(element) .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR); } this.mem.put(ALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0,pathListStr.length()-1))); } - + if (disallowPathList != null && !disallowPathList.isEmpty()) { this.denyPathList.addAll(disallowPathList); - + final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30); - for (String element : disallowPathList) { + for (final String element : disallowPathList) { pathListStr.append(element) .append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR); } this.mem.put(DISALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0, pathListStr.length()-1))); } } - + protected String getHostName() { return this.hostName; } - + protected String getAgentName() { return this.agentName; } - + protected Map getMem() { if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName)); return this.mem; } - + @Override public String toString() { final StringBuilder str = new StringBuilder(6000); str.append((this.hostName == null) ? "null" : this.hostName).append(": "); if (this.mem != null) str.append(this.mem.toString()); return str.toString(); - } - + } + /** * get the sitemap url * @return the sitemap url or null if no sitemap url is given */ public MultiProtocolURI getSitemap() { - String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null; + final String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null; if (url == null) return null; try { return new MultiProtocolURI(url); - } catch (MalformedURLException e) { + } catch (final MalformedURLException e) { return null; } } - + protected Date getLoadedDate() { if (this.mem.containsKey(LOADED_DATE)) { return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE))); } return null; } - + protected void setLoadedDate(final Date newLoadedDate) { if (newLoadedDate != null) { this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime()))); } } - + protected Date getModDate() { if (this.mem.containsKey(MOD_DATE)) { return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE))); } return null; - } - + } + protected String getETag() { if (this.mem.containsKey(ETAG)) { return ASCII.String(this.mem.get(ETAG)); } return null; - } - + } + protected long getCrawlDelayMillis() { if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try { return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS)); @@ -214,26 +216,38 @@ public class RobotsTxtEntry { } catch (final NumberFormatException e) { return 0; } - return 0; + return 0; } - - public boolean isDisallowed(MultiProtocolURI subpathURL) { + + public boolean isDisallowed(final MultiProtocolURI subpathURL) { String path = subpathURL.getFile(); - if ((this.mem == null) || (this.denyPathList.isEmpty())) return false; - + if (this.mem == null) { + this.info = "no robots file available"; + return false; + } + if (this.denyPathList.isEmpty()) { + this.info = "no entry in robots.txt"; + return false; + } + // if the path is null or empty we set it to / - if ((path == null) || (path.length() == 0)) path = "/"; + if (path == null || path.length() == 0) path = "/"; // escaping all occurences of ; because this char is used as special char in the Robots DB else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B"); - - for (String element : this.denyPathList) { - + + for (final String element : this.denyPathList) { + // disallow rule if (path.startsWith(element)) { + this.info = "path '" + path + "' starts with '" + element + "' from deny path list"; return true; } } + this.info = "path '" + path + "' does not start with any element from deny path list"; return false; } + public String getInfo() { + return this.info; + } } \ No newline at end of file diff --git a/source/de/anomic/data/ymark/TablesRowComparator.java b/source/de/anomic/data/ymark/TablesRowComparator.java index fe4a8f829..570193273 100644 --- a/source/de/anomic/data/ymark/TablesRowComparator.java +++ b/source/de/anomic/data/ymark/TablesRowComparator.java @@ -6,9 +6,14 @@ import net.yacy.kelondro.blob.Tables; public class TablesRowComparator implements Comparator { private String sortname; + private boolean desc; - public TablesRowComparator(final String sortname) { + public TablesRowComparator(final String sortname, final String sortorder) { setSortName(sortname); + if(sortorder.equals("desc")) + this.desc = true; + else + this.desc = false; } public void setSortName(final String sortname) { @@ -20,7 +25,10 @@ public class TablesRowComparator implements Comparator { if(row0.containsKey(this.sortname) && row1.containsKey(this.sortname)) { String name1 = UTF8.String(row0.get(this.sortname)).toLowerCase(); String name2 = UTF8.String(row1.get(this.sortname)).toLowerCase(); - return name1.compareTo(name2); + if(desc) + return name2.compareTo(name1); + else + return name1.compareTo(name2); } } return 0; diff --git a/source/de/anomic/data/ymark/YMarkAutoTagger.java b/source/de/anomic/data/ymark/YMarkAutoTagger.java index 8012aee88..0261d7631 100644 --- a/source/de/anomic/data/ymark/YMarkAutoTagger.java +++ b/source/de/anomic/data/ymark/YMarkAutoTagger.java @@ -29,7 +29,8 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle public final static String SPACE = " "; public final static String POISON = ""; - public final static HashSet stopwords = new HashSet(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo")); + public final static HashSet stopwords = new HashSet(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo", + "and", "with", "the", "gt", "lt")); private final ArrayBlockingQueue bmkQueue; @@ -90,35 +91,40 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle // generate potential tags from document title, description and subject final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32; final StringBuilder buffer = new StringBuilder(bufferSize); + final StringBuilder pwords = new StringBuilder(1000); buffer.append(document.dc_title().toLowerCase()); buffer.append(document.dc_description().toLowerCase()); buffer.append(document.dc_subject(' ').toLowerCase()); final Enumeration tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib); - int count = 0; + int score = 0; // get phrases final TreeMap phrases = getPhrases(document, 2); phrases.putAll(getPhrases(document, 3)); - phrases.putAll(getPhrases(document, 4)); final Iterator iter = phrases.keySet().iterator(); while(iter.hasNext()) { - count = 10; + score = 10; final String phrase = iter.next(); if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) { - count = phrases.get(phrase).size() * phrase.split(" ").length * 35; + score = phrases.get(phrase).size() * phrase.split(" ").length * 20; } if(isDigitSpace(phrase)) { - count = 10; + score = 10; } if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) { - count = count * 10; - } - topwords.add(new YMarkTag(phrase, count)); + score = score * 10; + } + if (tags.containsKey(phrase)) { + score = score * 20; + } + topwords.add(new YMarkTag(phrase, score)); + pwords.append(phrase); + pwords.append(' '); } - + // loop through potential tag and rank them while(tokens.hasMoreElements()) { - count = 0; + score = 0; token = tokens.nextElement(); // check if the token appears in the text @@ -126,23 +132,27 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle final Word word = words.get(token.toString()); // token appears in text and matches an existing bookmark tag if (tags.containsKey(token.toString())) { - count = word.occurrences() * tags.get(token.toString()).size() * 200; + score = word.occurrences() * tags.get(token.toString()).size() * 200; } // token appears in text and has more than 3 characters else if (token.length()>3) { - count = word.occurrences() * 100; + score = word.occurrences() * 100; + } + // if token is already part of a phrase, reduce score + if(pwords.toString().indexOf(token.toString())>1) { + score = score / 3; } - topwords.add(new YMarkTag(token.toString(), count)); + topwords.add(new YMarkTag(token.toString(), score)); } } - count = 0; + score = 0; buffer.setLength(0); for(final YMarkTag tag : topwords) { - if(count < max) { + if(score < max) { if(tag.size() > 100) { buffer.append(tag.name()); buffer.append(YMarkUtil.TAGS_SEPARATOR); - count++; + score++; } } else { break; @@ -165,7 +175,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle while(tokens.hasMoreElements()) { token = tokens.nextElement(); - if(stopwords.contains(token.toString())) + if(stopwords.contains(token.toString()) || isDigitSpace(token.toString())) continue; // if we have a full phrase, delete the first token diff --git a/source/de/anomic/data/ymark/YMarkCrawlStart.java b/source/de/anomic/data/ymark/YMarkCrawlStart.java index 7aeb4aea0..8315798e4 100644 --- a/source/de/anomic/data/ymark/YMarkCrawlStart.java +++ b/source/de/anomic/data/ymark/YMarkCrawlStart.java @@ -52,11 +52,13 @@ public class YMarkCrawlStart extends HashMap{ public void load(final String url) { try { final StringBuilder buffer = new StringBuilder(500); - buffer.append("^.*crawlingURL=\\Q"); + //buffer.append("^.*crawlingURL=\\Q"); + buffer.append("^crawl start for \\Q"); buffer.append(url); buffer.append("\\E?.*"); final Pattern pattern = Pattern.compile(buffer.toString()); - final Iterator APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern); + //final Iterator APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern); + final Iterator APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_COMMENT, pattern); Tables.Row row = null; while(APIcalls.hasNext()) { row = APIcalls.next(); diff --git a/source/de/anomic/data/ymark/YMarkMetadata.java b/source/de/anomic/data/ymark/YMarkMetadata.java index 1d5e46aa4..1d9cb9707 100644 --- a/source/de/anomic/data/ymark/YMarkMetadata.java +++ b/source/de/anomic/data/ymark/YMarkMetadata.java @@ -29,19 +29,14 @@ package de.anomic.data.ymark; import java.io.IOException; import java.net.MalformedURLException; import java.util.EnumMap; -import java.util.Map; -import java.util.TreeMap; import net.yacy.cora.date.ISO8601Formatter; import net.yacy.cora.document.UTF8; import net.yacy.cora.services.federated.yacy.CacheStrategy; -import net.yacy.document.Condenser; import net.yacy.document.Document; -import net.yacy.document.LibraryProvider; import net.yacy.document.Parser.Failure; import net.yacy.kelondro.data.meta.DigestURI; import net.yacy.kelondro.data.meta.URIMetadataRow; -import net.yacy.kelondro.data.word.Word; import net.yacy.repository.LoaderDispatcher; import net.yacy.search.index.Segments; import de.anomic.crawler.retrieval.Response; @@ -141,18 +136,4 @@ public class YMarkMetadata { } return metadata; } - - public TreeMap getWordCounts() { - if (this.document != null) { - return sortWordCounts(new Condenser(this.document, true, true, LibraryProvider.dymLib).words()); - } - return new TreeMap(); - } - - public static TreeMap sortWordCounts(final Map unsorted_words) { - final TreeMap sorted_words = new TreeMap(new YMarkWordCountComparator(unsorted_words)); - sorted_words.putAll(unsorted_words); - return sorted_words; - } - } diff --git a/source/de/anomic/data/ymark/YMarkTables.java b/source/de/anomic/data/ymark/YMarkTables.java index 6f8282b4e..0b4afd59c 100644 --- a/source/de/anomic/data/ymark/YMarkTables.java +++ b/source/de/anomic/data/ymark/YMarkTables.java @@ -27,9 +27,11 @@ package de.anomic.data.ymark; import java.io.IOException; +import java.util.ArrayList; +import java.util.Collections; import java.util.HashSet; import java.util.Iterator; -import java.util.SortedSet; +import java.util.List; import java.util.TreeMap; import java.util.TreeSet; import java.util.regex.Pattern; @@ -214,17 +216,16 @@ public class YMarkTables { return this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.TAGS.key(), p); } - public SortedSet orderBookmarksBy(final Iterator rowIterator, final String sortname, final String sortorder) { - final TreeSet sortTree = new TreeSet(new TablesRowComparator(sortname)); + public List orderBookmarksBy(final Iterator rowIterator, final String sortname, final String sortorder) { + final List sortList = new ArrayList(); Row row; while (rowIterator.hasNext()) { row = rowIterator.next(); if(row != null) - sortTree.add(row); + sortList.add(row); } - if(sortorder.equals("desc")) - return sortTree.descendingSet(); - return sortTree; + Collections.sort(sortList, new TablesRowComparator(sortname, sortorder)); + return sortList; } public void addTags(final String bmk_user, final String url, final String tagString, final boolean merge) throws IOException, RowSpaceExceededException { diff --git a/source/de/anomic/data/ymark/YMarkWordCountComparator.java b/source/de/anomic/data/ymark/YMarkWordCountComparator.java deleted file mode 100644 index 8ba178369..000000000 --- a/source/de/anomic/data/ymark/YMarkWordCountComparator.java +++ /dev/null @@ -1,53 +0,0 @@ -// YMarkWordCountComparator.java -// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany -// first published 2010 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA - -package de.anomic.data.ymark; - -import java.util.Comparator; -import java.util.Map; - -import net.yacy.kelondro.data.word.Word; - -public class YMarkWordCountComparator implements Comparator { - - private Map words; - - public YMarkWordCountComparator(final Map words) { - this.words = words; - } - - public int compare(final String k1, final String k2) { - final Word w1 = this.words.get(k1); - final Word w2 = this.words.get(k2); - - if(w1.occurrences() > w2.occurrences()) - return 1; - else if(w1.occurrences() < w2.occurrences()) - return -1; - else - return 0; - } -}