diff --git a/htroot/Bookmarks.html b/htroot/Bookmarks.html
index 9cb74d700..a580c0410 100644
--- a/htroot/Bookmarks.html
+++ b/htroot/Bookmarks.html
@@ -229,7 +229,7 @@ To see a list of all APIs, please visit the Delete
/
- Info
+ Info
#{/bookmarks}#
diff --git a/htroot/CrawlStartExpert_p.html b/htroot/CrawlStartExpert_p.html
index 944bd1405..5d0d55fcc 100644
--- a/htroot/CrawlStartExpert_p.html
+++ b/htroot/CrawlStartExpert_p.html
@@ -17,6 +17,9 @@
+
+
+
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
Expert Crawl Start
diff --git a/htroot/CrawlStartSite_p.html b/htroot/CrawlStartSite_p.html
index 70ab6f426..bdcfc89c7 100644
--- a/htroot/CrawlStartSite_p.html
+++ b/htroot/CrawlStartSite_p.html
@@ -17,6 +17,9 @@
+
+
+
#%env/templates/header.template%#
#%env/templates/submenuIndexCreate.template%#
Site Crawling
diff --git a/htroot/ViewFile.html b/htroot/ViewFile.html
index 0f4a511f7..6a60aaea8 100644
--- a/htroot/ViewFile.html
+++ b/htroot/ViewFile.html
@@ -7,6 +7,14 @@
+
+
+
+
+
+
See the page info about the url.
+
+
#(display)#
#%env/templates/simpleheader.template%#
::
diff --git a/htroot/api/util/getpageinfo_p.java b/htroot/api/getpageinfo_p.java
similarity index 85%
rename from htroot/api/util/getpageinfo_p.java
rename to htroot/api/getpageinfo_p.java
index 5a101c611..68c490807 100755
--- a/htroot/api/util/getpageinfo_p.java
+++ b/htroot/api/getpageinfo_p.java
@@ -8,6 +8,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.parser.html.ContentScraper;
import net.yacy.kelondro.data.meta.DigestURI;
+import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import de.anomic.crawler.RobotsTxtEntry;
import de.anomic.server.serverObjects;
@@ -24,21 +25,23 @@ public class getpageinfo_p {
prop.put("desc", "");
prop.put("lang", "");
prop.put("robots-allowed", "3"); //unknown
+ prop.put("robotsInfo", ""); //unknown
prop.put("sitemap", "");
prop.put("favicon","");
prop.put("sitelist", "");
prop.put("filter", ".*");
// default actions
- String actions="title,robots";
+ String actions = "title,robots";
if (post != null && post.containsKey("url")) {
- if(post.containsKey("actions"))
+ if (post.containsKey("actions"))
actions=post.get("actions");
String url=post.get("url");
- if(url.toLowerCase().startsWith("ftp://")){
+ if (url.toLowerCase().startsWith("ftp://")) {
prop.put("robots-allowed", "1");
- prop.putXML("title", "FTP: "+url);
+ prop.put("robotsInfo", "ftp does not follow robots.txt");
+ prop.putXML("title", "FTP: " + url);
return prop;
} else if (!url.startsWith("http://") &&
!url.startsWith("https://") &&
@@ -47,18 +50,18 @@ public class getpageinfo_p {
!url.startsWith("file://")) {
url = "http://" + url;
}
- if (actions.indexOf("title")>=0) {
+ if (actions.indexOf("title") >= 0) {
DigestURI u = null;
try {
u = new DigestURI(url);
} catch (final MalformedURLException e) {
- // fail, do nothing
+ Log.logException(e);
}
ContentScraper scraper = null;
if (u != null) try {
scraper = sb.loader.parseResource(u, CacheStrategy.IFEXIST);
} catch (final IOException e) {
- // now thats a fail, do nothing
+ Log.logException(e);
}
if (scraper != null) {
// put the document title
@@ -68,9 +71,9 @@ public class getpageinfo_p {
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
// put keywords
- final String list[]=scraper.getKeywords();
+ final String list[] = scraper.getKeywords();
int count = 0;
- for (final String element : list) {
+ for (final String element: list) {
final String tag = element;
if (!tag.equals("")) {
prop.putXML("tags_"+count+"_tag", tag);
@@ -100,7 +103,7 @@ public class getpageinfo_p {
prop.putXML("filter", filter.length() > 0 ? filter.substring(1) : ".*");
}
}
- if (actions.indexOf("robots")>=0) {
+ if (actions.indexOf("robots") >= 0) {
try {
final DigestURI theURL = new DigestURI(url);
@@ -110,13 +113,17 @@ public class getpageinfo_p {
robotsEntry = sb.robots.getEntry(theURL, sb.peers.myBotIDs());
} catch (final IOException e) {
robotsEntry = null;
+ Log.logException(e);
}
prop.put("robots-allowed", robotsEntry == null ? 1 : robotsEntry.isDisallowed(theURL) ? 0 : 1);
+ prop.putHTML("robotsInfo", robotsEntry == null ? "" : robotsEntry.getInfo()); // robotsEntry is null when the lookup above failed
// get the sitemap URL of the domain
final MultiProtocolURI sitemapURL = robotsEntry == null ? null : robotsEntry.getSitemap();
prop.putXML("sitemap", sitemapURL == null ? "" : sitemapURL.toString());
- } catch (final MalformedURLException e) {}
+ } catch (final MalformedURLException e) {
+ Log.logException(e);
+ }
}
}
diff --git a/htroot/api/util/getpageinfo_p.xml b/htroot/api/getpageinfo_p.xml
similarity index 92%
rename from htroot/api/util/getpageinfo_p.xml
rename to htroot/api/getpageinfo_p.xml
index b9590c990..84da4eb97 100644
--- a/htroot/api/util/getpageinfo_p.xml
+++ b/htroot/api/getpageinfo_p.xml
@@ -4,6 +4,7 @@
#[desc]#
#[lang]#
#(robots-allowed)#0::1::#(/robots-allowed)#
+ #[robotsInfo]#
#[sitemap]#
#[favicon]#
#[sitelist]#
diff --git a/htroot/api/ymarks/get_metadata.java b/htroot/api/ymarks/get_metadata.java
index 053c83668..702804e69 100644
--- a/htroot/api/ymarks/get_metadata.java
+++ b/htroot/api/ymarks/get_metadata.java
@@ -1,6 +1,8 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.EnumMap;
+import java.util.Iterator;
+
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
@@ -8,6 +10,7 @@ import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import de.anomic.data.UserDB;
import de.anomic.data.ymark.YMarkAutoTagger;
+import de.anomic.data.ymark.YMarkCrawlStart;
import de.anomic.data.ymark.YMarkEntry;
import de.anomic.data.ymark.YMarkMetadata;
import de.anomic.data.ymark.YMarkTables;
@@ -27,18 +30,43 @@ public class get_metadata {
final boolean isAuthUser = user!= null && user.hasRight(UserDB.AccessRight.BOOKMARK_RIGHT);
if(isAdmin || isAuthUser) {
+
final String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN);
- try {
- final String url = post.get(YMarkEntry.BOOKMARK.URL.key());
+
+
+ String url = post.get(YMarkEntry.BOOKMARK.URL.key(),YMarkEntry.BOOKMARK.URL.deflt());
+ boolean hasProtocol = false;
+ for (YMarkTables.PROTOCOLS p : YMarkTables.PROTOCOLS.values()) {
+ if(url.toLowerCase().startsWith(p.protocol())) {
+ hasProtocol = true;
+ break;
+ }
+ }
+ if (!hasProtocol) {
+ url=YMarkTables.PROTOCOLS.HTTP.protocol(url);
+ }
+
+ try {
YMarkMetadata meta = new YMarkMetadata(new DigestURI(url), sb.indexSegments);
final Document document = meta.loadDocument(sb.loader);
final EnumMap metadata = meta.loadMetadata();
prop.putXML("title", metadata.get(YMarkMetadata.METADATA.TITLE));
- prop.putXML("desc", metadata.get(YMarkMetadata.METADATA.DESCRIPTION));
-
+ prop.putXML("desc", metadata.get(YMarkMetadata.METADATA.DESCRIPTION));
prop.put("keywords", putTags(document.dc_subject(','), "keywords"));
prop.put("autotags", putTags(YMarkAutoTagger.autoTag(document, 5, sb.tables.bookmarks.getTags(bmk_user)), "autotags"));
+
+ final YMarkCrawlStart crawlStart = new YMarkCrawlStart(sb.tables, url);
+ final Iterator iter = crawlStart.keySet().iterator();
+ int count = 0;
+ String key;
+ while(iter.hasNext()) {
+ key = iter.next();
+ prop.putXML("crawlstart_"+count+"_key",key.toLowerCase());
+ prop.putXML("crawlstart_"+count+"_value",crawlStart.get(key));
+ count++;
+ }
+ prop.put("crawlstart", count);
} catch (MalformedURLException e1) {
// TODO Auto-generated catch block
diff --git a/htroot/api/ymarks/get_metadata.xml b/htroot/api/ymarks/get_metadata.xml
index 1cab22a98..adbb9a3b7 100644
--- a/htroot/api/ymarks/get_metadata.xml
+++ b/htroot/api/ymarks/get_metadata.xml
@@ -10,4 +10,9 @@
#{autotags}#
#{/autotags}#
+
+
\ No newline at end of file
diff --git a/htroot/api/ymarks/get_treeview.java b/htroot/api/ymarks/get_treeview.java
index 1bb453e4d..a44d97331 100644
--- a/htroot/api/ymarks/get_treeview.java
+++ b/htroot/api/ymarks/get_treeview.java
@@ -1,6 +1,5 @@
import java.io.IOException;
import java.net.MalformedURLException;
-import java.util.ArrayList;
import java.util.Date;
import java.util.EnumMap;
import java.util.Iterator;
@@ -9,18 +8,20 @@ import java.util.TreeMap;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.protocol.RequestHeader;
+import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.blob.Tables;
import net.yacy.kelondro.data.meta.DigestURI;
-import net.yacy.kelondro.data.word.Word;
import net.yacy.kelondro.index.RowSpaceExceededException;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import de.anomic.data.UserDB;
+import de.anomic.data.ymark.YMarkAutoTagger;
import de.anomic.data.ymark.YMarkCrawlStart;
import de.anomic.data.ymark.YMarkEntry;
import de.anomic.data.ymark.YMarkMetadata;
import de.anomic.data.ymark.YMarkTables;
+import de.anomic.data.ymark.YMarkTag;
import de.anomic.data.ymark.YMarkUtil;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
@@ -50,7 +51,7 @@ public class get_treeview {
boolean isMetadata = false;
boolean isURLdb = false;
boolean isCrawlStart = false;
- boolean isWordCount = false;
+ boolean isAutoTagger = false;
boolean displayBmk = false;
if (post != null){
@@ -73,7 +74,7 @@ public class get_treeview {
isURLdb = true;
isFolder = false;
} else if (post.get(ROOT).startsWith("w:")) {
- isWordCount = true;
+ isAutoTagger = true;
isFolder = false;
} else if (post.get(ROOT).startsWith("c:")) {
isCrawlStart = true;
@@ -192,7 +193,7 @@ public class get_treeview {
prop.put("folders_"+count+"_hash", "c:"+url);
prop.put("folders_"+count+"_hasChildren", "true");
count++;
- prop.put("folders_"+count+"_foldername","WordCounts");
+ prop.put("folders_"+count+"_foldername","AutoTagger");
putProp(count, "meta");
prop.put("folders_"+count+"_hash", "w:"+url);
prop.put("folders_"+count+"_hasChildren", "true");
@@ -205,23 +206,21 @@ public class get_treeview {
} catch (RowSpaceExceededException e) {
Log.logException(e);
}
- } else if (isWordCount || isMetadata || isURLdb || isCrawlStart) {
+ } else if (isAutoTagger || isMetadata || isURLdb || isCrawlStart) {
try {
final YMarkMetadata meta = new YMarkMetadata(new DigestURI(post.get(ROOT).substring(2)), sb.indexSegments);
- meta.loadDocument(sb.loader);
- if(isWordCount) {
- final TreeMap words = meta.getWordCounts();
- final ArrayList topwords = new ArrayList(words.descendingKeySet());
- for(int i = 0; i < 20 && i < topwords.size(); i++) {
- String word = topwords.get(i);
- int occur = words.get(word).occurrences();
- prop.put("folders_"+count+"_foldername",""+word+": [" + occur + "]");
- putProp(count, "meta");
- count++;
- }
- count--;
- prop.put("folders_"+count+"_comma", "");
+ final Document document = meta.loadDocument(sb.loader);
+ final TreeMap tags = sb.tables.bookmarks.getTags(bmk_user);
+ if(isAutoTagger) {
+ prop.put("folders_"+count+"_foldername","meta-"+YMarkMetadata.METADATA.KEYWORDS.name().toLowerCase()+": " + meta.loadMetadata().get(YMarkMetadata.METADATA.KEYWORDS) + "");
+ putProp(count, "meta");
+ count++;
+ prop.put("folders_"+count+"_foldername","with preference: "+YMarkAutoTagger.autoTag(document, 4, tags)+"");
+ putProp(count, "meta");
count++;
+ prop.put("folders_"+count+"_foldername","without preference: "+YMarkAutoTagger.autoTag(document, 4, new TreeMap())+"");
+ putProp(count, "meta");
+ count++;
prop.put("folders", count);
} else if(isMetadata) {
count = putMeta(count, meta.loadMetadata());
diff --git a/htroot/js/Bookmarks.js b/htroot/js/Bookmarks.js
index dd46ed8c7..6f8f2d8ed 100644
--- a/htroot/js/Bookmarks.js
+++ b/htroot/js/Bookmarks.js
@@ -24,7 +24,7 @@ function loadTitle(){
url=document.getElementsByName("url")[0].value;
if(document.getElementsByName("title")[0].value==""){
- sndReq('/api/util/getpageinfo_p.xml?actions=title&url='+url);
+ sndReq('/api/getpageinfo_p.xml?actions=title&url='+url);
}
}
diff --git a/htroot/js/IndexCreate.js b/htroot/js/IndexCreate.js
index 41f7f5378..e3c840fab 100644
--- a/htroot/js/IndexCreate.js
+++ b/htroot/js/IndexCreate.js
@@ -75,5 +75,7 @@ function loadInfos() {
url=document.getElementById("crawlingURL").value;
if (url.indexOf("ftp") == 0 || url.indexOf("smb") == 0) document.getElementById("crawlingQ").disabled=true; else document.getElementById("crawlingQ").disabled=false;
- sndReq('/api/util/getpageinfo_p.xml?actions=title,robots&url='+url);
+ sndReq('/api/getpageinfo_p.xml?actions=title,robots&url='+url);
+ document.getElementById("api").innerHTML = "
See the page info about the start url.";
+
}
diff --git a/htroot/yacy/ui/yacyui-bookmarks.html b/htroot/yacy/ui/yacyui-bookmarks.html
index 482158902..6e46bde39 100644
--- a/htroot/yacy/ui/yacyui-bookmarks.html
+++ b/htroot/yacy/ui/yacyui-bookmarks.html
@@ -85,7 +85,7 @@
var url = $("input[name='bm_url']").getValue();
$.ajax({
type: "GET",
- url: "/api/util/getpageinfo_p.xml?url="+url,
+ url: "/api/getpageinfo_p.xml?url="+url,
dataType: "xml",
success: function(xml) {
var title = $(xml).find('title').text();
diff --git a/htroot/yacy/ui/yacyui-search.html b/htroot/yacy/ui/yacyui-search.html
index 636804bd6..e233dd700 100644
--- a/htroot/yacy/ui/yacyui-search.html
+++ b/htroot/yacy/ui/yacyui-search.html
@@ -162,7 +162,7 @@
function getTags(url, i) {
$.ajax({
type: "GET",
- url: "/api/util/getpageinfo_p.xml?url="+url,
+ url: "/api/getpageinfo_p.xml?url="+url,
dataType: "xml",
success: function(xml) {
tags = "";
diff --git a/source/de/anomic/crawler/RobotsTxtEntry.java b/source/de/anomic/crawler/RobotsTxtEntry.java
index 1b6636883..394f87802 100644
--- a/source/de/anomic/crawler/RobotsTxtEntry.java
+++ b/source/de/anomic/crawler/RobotsTxtEntry.java
@@ -1,4 +1,4 @@
-//RobotsEntry.java
+//RobotsEntry.java
//-------------------------------------
//part of YACY
//(C) by Michael Peter Christen; mc@yacy.net
@@ -43,7 +43,7 @@ import net.yacy.kelondro.util.ByteArray;
public class RobotsTxtEntry {
-
+
private static final String HOST_NAME = "hostname";
private static final String ALLOW_PATH_LIST = "allow";
private static final String DISALLOW_PATH_LIST = "disallow";
@@ -54,16 +54,18 @@ public class RobotsTxtEntry {
private static final String CRAWL_DELAY = "crawlDelay";
private static final String CRAWL_DELAY_MILLIS = "crawlDelayMillis";
private static final String AGENT_NAME = "agentname";
-
+
// this is a simple record structure that holds all properties of a single crawl start
private final Map mem;
private final List allowPathList, denyPathList;
private final String hostName, agentName;
-
+ private String info = ""; // reason recorded by the most recent isDisallowed() call (set for allowed and disallowed outcomes alike); initialised here so getInfo() never returns null, whichever constructor ran
+
protected RobotsTxtEntry(final String hostName, final Map mem) {
this.hostName = hostName.toLowerCase();
- this.mem = mem;
-
+ this.mem = mem;
+ this.info = "";
+
if (this.mem.containsKey(DISALLOW_PATH_LIST)) {
this.denyPathList = new LinkedList();
final String csPl = UTF8.String(this.mem.get(DISALLOW_PATH_LIST));
@@ -89,12 +91,12 @@ public class RobotsTxtEntry {
this.allowPathList = new LinkedList();
}
this.agentName = this.mem.containsKey(AGENT_NAME) ? UTF8.String(this.mem.get(AGENT_NAME)) : null;
- }
-
+ }
+
protected RobotsTxtEntry(
- final MultiProtocolURI theURL,
- final List allowPathList,
- final List disallowPathList,
+ final MultiProtocolURI theURL,
+ final List allowPathList,
+ final List disallowPathList,
final Date loadedDate,
final Date modDate,
final String eTag,
@@ -103,12 +105,12 @@ public class RobotsTxtEntry {
final String agentName
) {
if (theURL == null) throw new IllegalArgumentException("The url is missing");
-
+
this.hostName = RobotsTxt.getHostPort(theURL).toLowerCase();
this.allowPathList = new LinkedList();
this.denyPathList = new LinkedList();
this.agentName = agentName;
-
+
this.mem = new LinkedHashMap(10);
this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
if (loadedDate != null) this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(loadedDate.getTime())));
@@ -117,92 +119,92 @@ public class RobotsTxtEntry {
if (sitemap != null) this.mem.put(SITEMAP, UTF8.getBytes(sitemap));
if (crawlDelayMillis > 0) this.mem.put(CRAWL_DELAY_MILLIS, UTF8.getBytes(Long.toString(crawlDelayMillis)));
if (agentName != null) this.mem.put(AGENT_NAME, UTF8.getBytes(agentName));
-
+
if (allowPathList != null && !allowPathList.isEmpty()) {
this.allowPathList.addAll(allowPathList);
-
+
final StringBuilder pathListStr = new StringBuilder(allowPathList.size() * 30);
- for (String element : allowPathList) {
+ for (final String element : allowPathList) {
pathListStr.append(element)
.append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(ALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0,pathListStr.length()-1)));
}
-
+
if (disallowPathList != null && !disallowPathList.isEmpty()) {
this.denyPathList.addAll(disallowPathList);
-
+
final StringBuilder pathListStr = new StringBuilder(disallowPathList.size() * 30);
- for (String element : disallowPathList) {
+ for (final String element : disallowPathList) {
pathListStr.append(element)
.append(RobotsTxt.ROBOTS_DB_PATH_SEPARATOR);
}
this.mem.put(DISALLOW_PATH_LIST, UTF8.getBytes(pathListStr.substring(0, pathListStr.length()-1)));
}
}
-
+
protected String getHostName() {
return this.hostName;
}
-
+
protected String getAgentName() {
return this.agentName;
}
-
+
protected Map getMem() {
if (!this.mem.containsKey(HOST_NAME)) this.mem.put(HOST_NAME, UTF8.getBytes(this.hostName));
return this.mem;
}
-
+
@Override
public String toString() {
final StringBuilder str = new StringBuilder(6000);
str.append((this.hostName == null) ? "null" : this.hostName).append(": ");
if (this.mem != null) str.append(this.mem.toString());
return str.toString();
- }
-
+ }
+
/**
* get the sitemap url
* @return the sitemap url or null if no sitemap url is given
*/
public MultiProtocolURI getSitemap() {
- String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
+ final String url = this.mem.containsKey(SITEMAP)? UTF8.String(this.mem.get(SITEMAP)): null;
if (url == null) return null;
try {
return new MultiProtocolURI(url);
- } catch (MalformedURLException e) {
+ } catch (final MalformedURLException e) {
return null;
}
}
-
+
protected Date getLoadedDate() {
if (this.mem.containsKey(LOADED_DATE)) {
return new Date(ByteArray.parseDecimal(this.mem.get(LOADED_DATE)));
}
return null;
}
-
+
protected void setLoadedDate(final Date newLoadedDate) {
if (newLoadedDate != null) {
this.mem.put(LOADED_DATE, UTF8.getBytes(Long.toString(newLoadedDate.getTime())));
}
}
-
+
protected Date getModDate() {
if (this.mem.containsKey(MOD_DATE)) {
return new Date(ByteArray.parseDecimal(this.mem.get(MOD_DATE)));
}
return null;
- }
-
+ }
+
protected String getETag() {
if (this.mem.containsKey(ETAG)) {
return ASCII.String(this.mem.get(ETAG));
}
return null;
- }
-
+ }
+
protected long getCrawlDelayMillis() {
if (this.mem.containsKey(CRAWL_DELAY_MILLIS)) try {
return ByteArray.parseDecimal(this.mem.get(CRAWL_DELAY_MILLIS));
@@ -214,26 +216,38 @@ public class RobotsTxtEntry {
} catch (final NumberFormatException e) {
return 0;
}
- return 0;
+ return 0;
}
-
- public boolean isDisallowed(MultiProtocolURI subpathURL) {
+
+ public boolean isDisallowed(final MultiProtocolURI subpathURL) {
String path = subpathURL.getFile();
- if ((this.mem == null) || (this.denyPathList.isEmpty())) return false;
-
+ if (this.mem == null) {
+ this.info = "no robots file available";
+ return false;
+ }
+ if (this.denyPathList.isEmpty()) {
+ this.info = "no entry in robots.txt";
+ return false;
+ }
+
// if the path is null or empty we set it to /
- if ((path == null) || (path.length() == 0)) path = "/";
+ if (path == null || path.length() == 0) path = "/";
// escaping all occurences of ; because this char is used as special char in the Robots DB
else path = RobotsTxt.ROBOTS_DB_PATH_SEPARATOR_MATCHER.matcher(path).replaceAll("%3B");
-
- for (String element : this.denyPathList) {
-
+
+ for (final String element : this.denyPathList) {
+
// disallow rule
if (path.startsWith(element)) {
+ this.info = "path '" + path + "' starts with '" + element + "' from deny path list";
return true;
}
}
+ this.info = "path '" + path + "' does not start with any element from deny path list";
return false;
}
+ public String getInfo() {
+ return this.info;
+ }
}
\ No newline at end of file
diff --git a/source/de/anomic/data/ymark/TablesRowComparator.java b/source/de/anomic/data/ymark/TablesRowComparator.java
index fe4a8f829..570193273 100644
--- a/source/de/anomic/data/ymark/TablesRowComparator.java
+++ b/source/de/anomic/data/ymark/TablesRowComparator.java
@@ -6,9 +6,14 @@ import net.yacy.kelondro.blob.Tables;
public class TablesRowComparator implements Comparator {
private String sortname;
+ private boolean desc;
- public TablesRowComparator(final String sortname) {
+ public TablesRowComparator(final String sortname, final String sortorder) {
setSortName(sortname);
+ // Literal-first comparison is null-safe: a missing sort order falls back to
+ // ascending instead of throwing a NullPointerException. Any value other
+ // than "desc" also sorts ascending, exactly as before.
+ this.desc = "desc".equals(sortorder);
}
public void setSortName(final String sortname) {
@@ -20,7 +25,10 @@ public class TablesRowComparator implements Comparator {
if(row0.containsKey(this.sortname) && row1.containsKey(this.sortname)) {
String name1 = UTF8.String(row0.get(this.sortname)).toLowerCase();
String name2 = UTF8.String(row1.get(this.sortname)).toLowerCase();
- return name1.compareTo(name2);
+ if(desc)
+ return name2.compareTo(name1);
+ else
+ return name1.compareTo(name2);
}
}
return 0;
diff --git a/source/de/anomic/data/ymark/YMarkAutoTagger.java b/source/de/anomic/data/ymark/YMarkAutoTagger.java
index 8012aee88..0261d7631 100644
--- a/source/de/anomic/data/ymark/YMarkAutoTagger.java
+++ b/source/de/anomic/data/ymark/YMarkAutoTagger.java
@@ -29,7 +29,8 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
public final static String SPACE = " ";
public final static String POISON = "";
- public final static HashSet stopwords = new HashSet(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo"));
+ public final static HashSet stopwords = new HashSet(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo",
+ "and", "with", "the", "gt", "lt"));
private final ArrayBlockingQueue bmkQueue;
@@ -90,35 +91,40 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
final StringBuilder buffer = new StringBuilder(bufferSize);
+ final StringBuilder pwords = new StringBuilder(1000);
buffer.append(document.dc_title().toLowerCase());
buffer.append(document.dc_description().toLowerCase());
buffer.append(document.dc_subject(' ').toLowerCase());
final Enumeration tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
- int count = 0;
+ int score = 0;
// get phrases
final TreeMap phrases = getPhrases(document, 2);
phrases.putAll(getPhrases(document, 3));
- phrases.putAll(getPhrases(document, 4));
final Iterator iter = phrases.keySet().iterator();
while(iter.hasNext()) {
- count = 10;
+ score = 10;
final String phrase = iter.next();
if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
- count = phrases.get(phrase).size() * phrase.split(" ").length * 35;
+ score = phrases.get(phrase).size() * phrase.split(" ").length * 20;
}
if(isDigitSpace(phrase)) {
- count = 10;
+ score = 10;
}
if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {
- count = count * 10;
- }
- topwords.add(new YMarkTag(phrase, count));
+ score = score * 10;
+ }
+ if (tags.containsKey(phrase)) {
+ score = score * 20;
+ }
+ topwords.add(new YMarkTag(phrase, score));
+ pwords.append(phrase);
+ pwords.append(' ');
}
-
+
// loop through potential tag and rank them
while(tokens.hasMoreElements()) {
- count = 0;
+ score = 0;
token = tokens.nextElement();
// check if the token appears in the text
@@ -126,23 +132,27 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
final Word word = words.get(token.toString());
// token appears in text and matches an existing bookmark tag
if (tags.containsKey(token.toString())) {
- count = word.occurrences() * tags.get(token.toString()).size() * 200;
+ score = word.occurrences() * tags.get(token.toString()).size() * 200;
}
// token appears in text and has more than 3 characters
else if (token.length()>3) {
- count = word.occurrences() * 100;
+ score = word.occurrences() * 100;
+ }
+ // if token is already part of a phrase, reduce score (index 0 is a valid hit: the first phrase starts there)
+ if(pwords.indexOf(token.toString()) >= 0) {
+ score = score / 3;
}
- topwords.add(new YMarkTag(token.toString(), count));
+ topwords.add(new YMarkTag(token.toString(), score));
}
}
- count = 0;
+ score = 0;
buffer.setLength(0);
for(final YMarkTag tag : topwords) {
- if(count < max) {
+ if(score < max) {
if(tag.size() > 100) {
buffer.append(tag.name());
buffer.append(YMarkUtil.TAGS_SEPARATOR);
- count++;
+ score++;
}
} else {
break;
@@ -165,7 +175,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
while(tokens.hasMoreElements()) {
token = tokens.nextElement();
- if(stopwords.contains(token.toString()))
+ if(stopwords.contains(token.toString()) || isDigitSpace(token.toString()))
continue;
// if we have a full phrase, delete the first token
diff --git a/source/de/anomic/data/ymark/YMarkCrawlStart.java b/source/de/anomic/data/ymark/YMarkCrawlStart.java
index 7aeb4aea0..8315798e4 100644
--- a/source/de/anomic/data/ymark/YMarkCrawlStart.java
+++ b/source/de/anomic/data/ymark/YMarkCrawlStart.java
@@ -52,11 +52,13 @@ public class YMarkCrawlStart extends HashMap{
public void load(final String url) {
try {
final StringBuilder buffer = new StringBuilder(500);
- buffer.append("^.*crawlingURL=\\Q");
+ //buffer.append("^.*crawlingURL=\\Q");
+ buffer.append("^crawl start for \\Q");
buffer.append(url);
buffer.append("\\E?.*");
final Pattern pattern = Pattern.compile(buffer.toString());
- final Iterator APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern);
+ //final Iterator APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern);
+ final Iterator APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_COMMENT, pattern);
Tables.Row row = null;
while(APIcalls.hasNext()) {
row = APIcalls.next();
diff --git a/source/de/anomic/data/ymark/YMarkMetadata.java b/source/de/anomic/data/ymark/YMarkMetadata.java
index 1d5e46aa4..1d9cb9707 100644
--- a/source/de/anomic/data/ymark/YMarkMetadata.java
+++ b/source/de/anomic/data/ymark/YMarkMetadata.java
@@ -29,19 +29,14 @@ package de.anomic.data.ymark;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.EnumMap;
-import java.util.Map;
-import java.util.TreeMap;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy;
-import net.yacy.document.Condenser;
import net.yacy.document.Document;
-import net.yacy.document.LibraryProvider;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.kelondro.data.meta.URIMetadataRow;
-import net.yacy.kelondro.data.word.Word;
import net.yacy.repository.LoaderDispatcher;
import net.yacy.search.index.Segments;
import de.anomic.crawler.retrieval.Response;
@@ -141,18 +136,4 @@ public class YMarkMetadata {
}
return metadata;
}
-
- public TreeMap getWordCounts() {
- if (this.document != null) {
- return sortWordCounts(new Condenser(this.document, true, true, LibraryProvider.dymLib).words());
- }
- return new TreeMap();
- }
-
- public static TreeMap sortWordCounts(final Map unsorted_words) {
- final TreeMap sorted_words = new TreeMap(new YMarkWordCountComparator(unsorted_words));
- sorted_words.putAll(unsorted_words);
- return sorted_words;
- }
-
}
diff --git a/source/de/anomic/data/ymark/YMarkTables.java b/source/de/anomic/data/ymark/YMarkTables.java
index 6f8282b4e..0b4afd59c 100644
--- a/source/de/anomic/data/ymark/YMarkTables.java
+++ b/source/de/anomic/data/ymark/YMarkTables.java
@@ -27,9 +27,11 @@
package de.anomic.data.ymark;
import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Collections;
import java.util.HashSet;
import java.util.Iterator;
-import java.util.SortedSet;
+import java.util.List;
import java.util.TreeMap;
import java.util.TreeSet;
import java.util.regex.Pattern;
@@ -214,17 +216,16 @@ public class YMarkTables {
return this.worktables.iterator(bmk_table, YMarkEntry.BOOKMARK.TAGS.key(), p);
}
- public SortedSet orderBookmarksBy(final Iterator rowIterator, final String sortname, final String sortorder) {
- final TreeSet sortTree = new TreeSet(new TablesRowComparator(sortname));
+ public List orderBookmarksBy(final Iterator rowIterator, final String sortname, final String sortorder) {
+ final List sortList = new ArrayList();
Row row;
while (rowIterator.hasNext()) {
row = rowIterator.next();
if(row != null)
- sortTree.add(row);
+ sortList.add(row);
}
- if(sortorder.equals("desc"))
- return sortTree.descendingSet();
- return sortTree;
+ Collections.sort(sortList, new TablesRowComparator(sortname, sortorder));
+ return sortList;
}
public void addTags(final String bmk_user, final String url, final String tagString, final boolean merge) throws IOException, RowSpaceExceededException {
diff --git a/source/de/anomic/data/ymark/YMarkWordCountComparator.java b/source/de/anomic/data/ymark/YMarkWordCountComparator.java
deleted file mode 100644
index 8ba178369..000000000
--- a/source/de/anomic/data/ymark/YMarkWordCountComparator.java
+++ /dev/null
@@ -1,53 +0,0 @@
-// YMarkWordCountComparator.java
-// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
-// first published 2010 on http://yacy.net
-//
-// This is a part of YaCy, a peer-to-peer based web search engine
-//
-// $LastChangedDate$
-// $LastChangedRevision$
-// $LastChangedBy$
-//
-// LICENSE
-//
-// This program is free software; you can redistribute it and/or modify
-// it under the terms of the GNU General Public License as published by
-// the Free Software Foundation; either version 2 of the License, or
-// (at your option) any later version.
-//
-// This program is distributed in the hope that it will be useful,
-// but WITHOUT ANY WARRANTY; without even the implied warranty of
-// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
-// GNU General Public License for more details.
-//
-// You should have received a copy of the GNU General Public License
-// along with this program; if not, write to the Free Software
-// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
-
-package de.anomic.data.ymark;
-
-import java.util.Comparator;
-import java.util.Map;
-
-import net.yacy.kelondro.data.word.Word;
-
-public class YMarkWordCountComparator implements Comparator {
-
- private Map words;
-
- public YMarkWordCountComparator(final Map words) {
- this.words = words;
- }
-
- public int compare(final String k1, final String k2) {
- final Word w1 = this.words.get(k1);
- final Word w2 = this.words.get(k2);
-
- if(w1.occurrences() > w2.occurrences())
- return 1;
- else if(w1.occurrences() < w2.occurrences())
- return -1;
- else
- return 0;
- }
-}