- some refactoring (ymarks)

- improvement for autotagger (it is now able to create/detect multi-word tags, e.g. 'open source')



git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@8031 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
apfelmaennchen 14 years ago
parent 2f03186252
commit 5f7dbe1c42

@ -3,263 +3,24 @@
<head> <head>
<title>YaCy Bookmarks</title> <title>YaCy Bookmarks</title>
#%env/templates/metas.template%# #%env/templates/metas.template%#
<link media="screen" type="text/css" href="/env/yacy-ymarks.css" rel="stylesheet" />
<link media="screen" type="text/css" href="/yacy/ui/css/jquery.flexigrid.css" rel="stylesheet" /> <link media="screen" type="text/css" href="/yacy/ui/css/jquery.flexigrid.css" rel="stylesheet" />
<link media="screen" type="text/css" href="/yacy/ui/css/jquery.treeview.css" rel="stylesheet" /> <link media="screen" type="text/css" href="/yacy/ui/css/jquery.treeview.css" rel="stylesheet" />
<link media="screen" type="text/css" href="/yacy/ui/css/jquery.multiselect.css" rel="stylesheet" /> <link media="screen" type="text/css" href="/yacy/ui/css/jquery.multiselect.css" rel="stylesheet" />
<link media="screen" type="text/css" href="/env/yacy-ymarks.css" rel="stylesheet" />
</head>
<body id="ymarks_body">
#%env/templates/header.template%#
<script src="/yacy/ui/js/jquery-flexigrid.js" type="text/javascript"></script> <script src="/yacy/ui/js/jquery-flexigrid.js" type="text/javascript"></script>
<script src="/yacy/ui/js/jquery.treeview.min.js" type="text/javascript"></script> <script src="/yacy/ui/js/jquery.treeview.min.js" type="text/javascript"></script>
<script src="/yacy/ui/js/jquery.treeview.async.js" type="text/javascript"></script> <script src="/yacy/ui/js/jquery.treeview.async.js" type="text/javascript"></script>
<script src="/yacy/ui/js/jquery.multiselect.min.js" type="text/javascript"></script> <script src="/yacy/ui/js/jquery.multiselect.min.js" type="text/javascript"></script>
<script src="/js/yacy-ymarks.js" type="text/javascript"></script>
<script src="/js/yacy-ymarks-bookmark-actions.js" type="text/javascript"></script>
<script src="/js/yacy-ymarks-tag-actions.js" type="text/javascript"></script>
</head>
<body id="ymarks_body">
#%env/templates/header.template%#
<script type="text/javascript">
//<![CDATA[
/**
 * HTML-encodes a string by writing it as text into a detached <div> and
 * reading the resulting markup back.
 *
 * Declared as a regular function instead of an assignment to an undeclared
 * identifier, so it no longer relies on implicit global creation (which
 * throws in strict mode).
 *
 * @param {string} s text to encode
 * @returns {string} the markup jQuery produces for that text
 */
function HTMLenc(s) {
    return $('<div/>').text(s).html();
}
/*
 * Page bootstrap, run once the DOM is ready: sets up the bookmark dialog, the
 * flexigrid bookmark table, the sidebar tabs and the folder tree view.
 */
$(document).ready(function() {
    // Grid height is derived from the viewport height; the sidebar below is
    // sized relative to it.
    var height=document.documentElement.clientHeight - 200;

    /* Initialize Bookmark Dialog */
    $("#ymarks_add_dialog").dialog({
        autoOpen: false,
        height: 420,
        width: 340,
        position: ['top',100],
        modal: true,
        resizable: false,
        buttons: {
            OK: function() {
                var url = $("input[name='bm_url']").getValue();
                var title = $("input[name='bm_title']").getValue();
                var desc = $("textarea[name='bm_desc']").getValue();
                var tags = $("input[name='bm_tags']").getValue();
                var path = $("input[name='bm_path']").getValue();
                var pub = $("select[name='bm_public']").getValue();
                $.ajax({
                    type: "POST",
                    url: "/api/ymarks/add_ymark.xml",
                    // Passed as an object so jQuery URL-encodes every value.
                    // A hand-concatenated string breaks as soon as a field
                    // contains '&', '=', '+' or '#'.
                    data: {
                        url: url,
                        title: title,
                        desc: desc,
                        tags: tags,
                        folders: path,
                        "public": pub
                    },
                    dataType: "xml",
                    success: function(xml) {
                        $('#bmaddform').resetForm();
                        // Drop the blur handler installed by the 'Add' action.
                        $("#bm_url").unbind('blur');
                        $("#ymarks_add_dialog").dialog("close");
                        $('#ymarks_flexigrid').flexReload();
                        return false;
                    }
                });
            } ,
            Cancel: function() { $("#ymarks_add_dialog").dialog("close"); }
        }
    });

    /* Initialize Flexigrid */
    $('#ymarks_flexigrid').flexigrid({
        url: '/api/ymarks/get_ymark.json',
        dataType: 'json',
        method: 'GET',
        colModel: [
            {display: 'Hash', name : 'hash', width : 50, sortable : false, align: 'center', hide: true},
            {display: 'Public', name : 'public', width : 25, sortable : true, align: 'center'},
            {display: 'Title', name : 'title', width : 400, sortable : true, align: 'left'},
            {display: 'Tags', name : 'tags', width : 160, sortable : false, align: 'left'},
            {display: 'Folders', name : 'folders', width : 160, sortable : true, align: 'left', hide: true},
            {display: 'Date added', name : 'date_added', width : 100, sortable : true, align: 'left'},
            {display: 'Date visited', name : 'date_visited', width : 100, sortable : true, align: 'left'}
        ],
        buttons: [
            // '...' resets sorting and query to the initial grid settings.
            {name: '...', bclass: 'burst', onpress: function() {
                $('#ymarks_flexigrid').flexOptions({
                    sortname: "title",
                    sortorder: "asc",
                    query: ".*",
                    qtype: "title"
                });
                $('#ymarks_flexigrid').flexReload();
            }},
            {separator: true},
            {name: 'Add', bclass: 'bookmark', onpress: bm_action},
            {name: 'Edit', bclass: 'edit', onpress: bm_action},
            {name: 'Delete', bclass: 'delete', onpress: bm_action},
            {separator: true},
            {name: 'Crawl', bclass: 'crawl', onpress: bm_action},
            {separator: true},
            {name: 'Add', bclass: 'addTag', onpress: tag_action},
            {name: 'Rename', bclass: 'editTag', onpress: tag_action},
            {separator: true},
            {name: 'Help', bclass: 'help', onpress: bm_action}
        ],
        searchitems : [
            {display: 'Full text (regexp)', name : ''},
            {display: 'Tags (comma seperated)', name : '_tags'},
            {display: 'Tags (regexp)', name : 'tags'},
            {display: 'Singel Folder', name : '_folder'},
            {display: 'Folders (regexp)', name : 'folders'},
            {display: 'Title (regexp)', name : 'title'},
            {display: 'Description (regexp)', name : 'desc'}
        ],
        useRp: true,
        rp: 15,
        sortname: "title",
        sortorder: "asc",
        usepager: true,
        striped: true,
        nowrap: false,
        height: height,
        query: ".*",
        qtype: "title"
    });

    /* Initialize Sidebar */
    $('#ymarks_sidebar').height(height+90);
    // NOTE(review): $tabs and tabid are assigned without a declaration and
    // therefore become globals; left untouched because other scripts may
    // read them - confirm before localising.
    $tabs = $('#ymarks_sidebar').tabs({
        // tabs options
    });
    $tabs.bind('tabsselect', function(event, ui) {
        /*
        Objects available in the function context:
        ui.tab - anchor element of the selected (clicked) tab
        ui.panel - element, that contains the selected/clicked tab contents
        ui.index - zero-based index of the selected (clicked) tab
        */
        tabid = "#"+ui.panel.id;
        // The tag cloud is (re)loaded each time its tab gets selected.
        if (tabid == "#ymarks_tags_tab") {
            loadTagCloud();
        }
        return true;
    });

    $("#ymarks_treeview").treeview({
        url: "/api/ymarks/get_treeview.json?bmtype=href",
        unique: true,
        persist: "location"
    });
    // Clicking a tree node filters the grid by the id of the closest <li>
    // ancestor of the click target.
    $("#ymarks_treeview").bind("click", function(event) {
        if ($(event.target).is("li") || $(event.target).parents("li").length) {
            var folder = $(event.target).parents("li").filter(":first").attr("id");
            $('#ymarks_flexigrid').flexOptions({
                query: folder,
                qtype: "_folder",
                newp: 1
            });
            $('#ymarks_flexigrid').flexReload();
            return false;
        }
    });

    $("#example").multiselect();
});
/**
 * Rebuilds the tag cloud: removes the current children of #ymarks_tagcloud,
 * requests the tag list and appends one clickable link per returned tag.
 * Clicking a link filters the bookmark grid by that tag.
 */
function loadTagCloud() {
    // Restrict the grid to one tag and jump back to the first page.
    var filterGridByTag = function() {
        var qtag = $(this).text().replace(/\s+$/g,"");
        var grid = '#ymarks_flexigrid';
        $(grid).flexOptions({query: qtag, qtype: "_tags", newp: 1});
        $(grid).flexReload();
    };

    // Turn one <tag> element of the response into a link inside the cloud.
    var appendTagLink = function() {
        var node = $(this);
        var count = node.attr('count');
        var tag = node.attr('tag');
        var scaled = (count/20)+0.3;
        // Font size grows with the count but never drops below 1em.
        var size = (scaled < 1) ? 1 : scaled;
        $('<a style="font-size:'+size+'em"></a>')
            .html(HTMLenc(tag)+' ')
            .appendTo('#ymarks_tagcloud')
            .bind('click', filterGridByTag);
    };

    $("#ymarks_tagcloud *").remove();
    $.ajax({
        type: "POST",
        url: "/api/ymarks/get_tags.xml?top=25&sort=alpha",
        dataType: "xml",
        cache: false,
        success: function(xml) {
            $(xml).find('tag').each(appendTagLink);
        }
    });
}
/* Initialize Bookmark Actions */
/**
 * Toolbar handler for the bookmark buttons of the flexigrid.
 *
 * @param {string} com  label of the pressed button ('Add', 'Edit', 'Delete')
 * @param {Object} grid context used to look up the selected rows (.trSelected)
 * @returns {boolean|undefined} false when editing is refused, otherwise undefined
 */
function bm_action(com,grid) {
    if (com=='Delete') {
        var check = confirm('Delete ' + $('.trSelected',grid).length + ' bookmark(s)?');
        if(check == true) {
            $('.trSelected',grid).each(function(){
                // The id is taken from the text of the row's first cell and
                // encoded so it cannot break the query string.
                var url = "/api/ymarks/delete_ymark.xml?id="+encodeURIComponent($(this).find('td :first').text());
                $.ajax({
                    type: 'POST',
                    url: url,
                    dataType: 'xml',
                    success: function(xml) {
                        $('#ymarks_flexigrid').flexReload();
                    }
                }); // close $.ajax(
            }); //close each(
        }
    }
    else if (com=='Add') {
        $('#bmaddform').resetForm();
        // Remove a handler left over from an earlier 'Add' (e.g. after Cancel),
        // otherwise every press stacks one more page-info request per blur.
        $("#bm_url").unbind('blur');
        $("#bm_url").blur(function() {
            var url = $("input[name='bm_url']").getValue();
            $.ajax({
                type: "GET",
                // The bookmark URL usually contains '&', '?' or '#' itself.
                url: "/api/util/getpageinfo_p.xml?url="+encodeURIComponent(url),
                dataType: "xml",
                success: function(xml) {
                    var title = $(xml).find('title').text();
                    $("input[name='bm_title']").setValue(title);
                    var desc = $(xml).find('desc').text();
                    $("textarea[name='bm_desc']").setValue(desc);
                    // Collect the names locally and join them, so the field
                    // does not start with a stray comma.
                    var names = [];
                    $(xml).find('tag').each(function(){
                        names.push($(this).attr('name'));
                    });
                    $("input[name='bm_tags']").setValue(names.join(","));
                }
            });
        });
        $("#ymarks_add_dialog").dialog('open');
    }
    else if (com=='Edit') {
        if ($('.trSelected',grid).length > 1) {
            alert("Editing of more than one selected bookmark is currently not supportet!");
            return false;
        }
        // Pre-fill the dialog from the markup of the selected row.
        var row = $('.trSelected',grid);
        $("input[name='bm_url']").setValue(row.find('.url').text());
        $("input[name='bm_title']").setValue(row.find('h3.linktitle').text().trim());
        $("textarea[name='bm_desc']").setValue(row.find('p.desc').text().trim());
        $("input[name='bm_tags']").setValue(row.find('p.tags').text().trim().replace(/,\s/g,","));
        $("input[name='bm_path']").setValue(row.find('p.folders').text().replace(/,\s/g,","));
        $("select[name='bm_public']").setValue(row.find('img').attr('alt'));
        $("#ymarks_add_dialog").dialog('open');
    }
}
/* Initialize Tag Actions */
/**
 * Toolbar handler for the tag buttons: resets and opens the "add tag" dialog
 * for 'Add', and the "edit tag" dialog for any other command.
 *
 * @param {string} com  label of the pressed button
 * @param {Object} grid grid the button belongs to
 */
function tag_action(com,grid) {
    var adding = (com=='Add');
    if (adding) {
        // NOTE(review): 'flex' is not declared in this block; presumably it is
        // a shared variable read by the tag dialogs - confirm.
        flex = grid;
    }
    var formSelector = adding ? '#tagaddform' : '#tageditform';
    var dialogSelector = adding ? "#tagadd" : "#tagedit";
    $(formSelector).resetForm();
    $(dialogSelector).dialog('open');
}
//]]>
</script>
<div class="SubMenu"> <div class="SubMenu">
<h3>Bookmarks</h3> <h3>Bookmarks</h3>
<!-- <!--

@ -0,0 +1,72 @@
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.EnumMap;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.kelondro.data.meta.DigestURI;
import net.yacy.search.Switchboard;
import de.anomic.data.UserDB;
import de.anomic.data.ymark.YMarkAutoTagger;
import de.anomic.data.ymark.YMarkEntry;
import de.anomic.data.ymark.YMarkMetadata;
import de.anomic.data.ymark.YMarkTables;
import de.anomic.data.ymark.YMarkUtil;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Servlet that loads the document behind a bookmark URL and returns its title,
 * description, keywords and suggested tags (autotags) as template properties.
 * Only admins and users holding the bookmark right get metadata; everybody
 * else receives the authentication message.
 */
public class get_metadata {

    /**
     * Result object of the most recently started request.
     *
     * @deprecated shared mutable state; request handling no longer reads it.
     *             It is still assigned so that legacy callers of
     *             {@link #putTags(String, String)} keep working.
     */
    @Deprecated
    static serverObjects prop;

    /**
     * Handles one request.
     *
     * @param header request header, used for user lookup and authentication
     * @param post   request parameters; expected to carry the bookmark URL
     * @param env    the server environment, cast to {@link Switchboard}
     * @return the properties to be merged into the response template
     */
    public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {
        final Switchboard sb = (Switchboard) env;
        // Every request works on its own result object. The former static
        // field was overwritten by each request, so concurrent requests could
        // write tags into each other's response.
        final serverObjects result = new serverObjects();
        prop = result;

        final UserDB.Entry user = sb.userDB.getUser(header);
        final boolean isAdmin = (sb.verifyAuthentication(header, true));
        final boolean isAuthUser = user!= null && user.hasRight(UserDB.AccessRight.BOOKMARK_RIGHT);

        if(isAdmin || isAuthUser) {
            final String bmk_user = (isAuthUser ? user.getUserName() : YMarkTables.USER_ADMIN);
            final String url = (post == null) ? null : post.get(YMarkEntry.BOOKMARK.URL.key());
            if (url == null) {
                // Nothing to look up; answer with an empty result instead of failing.
                return result;
            }
            try {
                final YMarkMetadata meta = new YMarkMetadata(new DigestURI(url), sb.indexSegments);
                final Document document = meta.loadDocument(sb.loader);
                final EnumMap<YMarkMetadata.METADATA, String> metadata = meta.loadMetadata();
                result.putXML("title", metadata.get(YMarkMetadata.METADATA.TITLE));
                result.putXML("desc", metadata.get(YMarkMetadata.METADATA.DESCRIPTION));
                final String keywords = (document == null) ? null : document.dc_subject(',');
                result.put("keywords", putTags(result, keywords, "keywords"));
                result.put("autotags", putTags(result, YMarkAutoTagger.autoTag(document, 5, sb.tables.bookmarks.getTags(bmk_user)), "autotags"));
            } catch (MalformedURLException e1) {
                e1.printStackTrace();
            } catch (IOException e) {
                e.printStackTrace();
            } catch (Failure e) {
                e.printStackTrace();
            }
        } else {
            result.put(YMarkTables.USER_AUTHENTICATE,YMarkTables.USER_AUTHENTICATE_MSG);
        }
        // return rewrite properties
        return result;
    }

    /**
     * Legacy entry point writing into the shared {@link #prop} object.
     *
     * @param tagString tags joined by {@link YMarkUtil#TAGS_SEPARATOR}
     * @param var       name of the template list the tags are written to
     * @return number of non-empty tags written
     * @deprecated relies on shared static state
     */
    @Deprecated
    public static int putTags(final String tagString, final String var) {
        return putTags(prop, tagString, var);
    }

    /**
     * Splits a tag string and stores every non-empty tag as
     * <code>var_&lt;index&gt;_tag</code> in the given properties.
     *
     * @param target    properties object receiving the entries
     * @param tagString tags joined by {@link YMarkUtil#TAGS_SEPARATOR}; may be null
     * @param var       name of the template list the tags are written to
     * @return number of tags written
     */
    private static int putTags(final serverObjects target, final String tagString, final String var) {
        if (tagString == null) {
            return 0;
        }
        int count = 0;
        for (final String tag : tagString.split(YMarkUtil.TAGS_SEPARATOR)) {
            if (!tag.equals("")) {
                target.putXML(var+"_"+count+"_tag", tag);
                count++;
            }
        }
        return count;
    }
}

@ -0,0 +1,13 @@
<?xml version='1.0' encoding="UTF-8" standalone='yes'?>
<info>
<title>#[title]#</title>
<desc>#[desc]#</desc>
<keywords>
#{keywords}#<tag name="#[tag]#" />
#{/keywords}#
</keywords>
<autotags>
#{autotags}#<tag name="#[tag]#" />
#{/autotags}#
</autotags>
</info>

@ -0,0 +1,98 @@
/* Initialize Bookmark Actions */
/**
 * Toolbar handler for the bookmark buttons of the flexigrid.
 *
 * @param {string} com  label of the pressed button ('Add', 'Edit', 'Delete', ...)
 * @param {Object} grid context used to look up the selected rows (.trSelected)
 * @returns {boolean|undefined} false when the action is refused or not
 *          available, otherwise undefined
 */
function bm_action(com,grid) {
    if (com=='Delete') {
        var check = confirm('Delete ' + $('.trSelected',grid).length + ' bookmark(s)?');
        if(check == true) {
            $('.trSelected',grid).each(function(){
                // The id is taken from the text of the row's first cell and
                // encoded so it cannot break the query string.
                var url = "/api/ymarks/delete_ymark.xml?id="+encodeURIComponent($(this).find('td :first').text());
                $.ajax({
                    type: 'POST',
                    url: url,
                    dataType: 'xml',
                    success: function(xml) {
                        $('#ymarks_flexigrid').flexReload();
                    }
                }); // close $.ajax(
            }); //close each(
        }
    }
    else if (com=='Add') {
        $('#bmaddform').resetForm();
        // Remove a handler left over from an earlier 'Add' (e.g. after Cancel),
        // otherwise every press stacks one more metadata request per blur.
        $("#bm_url").unbind('blur');
        $("#bm_url").blur(function() {
            var url = $("input[name='bm_url']").getValue();
            $.ajax({
                type: "GET",
                // The bookmark URL usually contains '&', '?' or '#' itself.
                url: "/api/ymarks/get_metadata.xml?url="+encodeURIComponent(url),
                dataType: "xml",
                success: function(xml) {
                    var title = $(xml).find('title').text();
                    $("input[name='bm_title']").setValue(title);
                    var desc = $(xml).find('desc').text();
                    $("textarea[name='bm_desc']").setValue(desc);
                    // Only the <autotags> section is used for the suggestion;
                    // joining the names avoids a leading comma in the field.
                    var names = [];
                    $(xml).find('autotags').find('tag').each(function(){
                        names.push($(this).attr('name'));
                    });
                    $("input[name='bm_tags']").setValue(names.join(","));
                }
            });
        });
        $("#ymarks_add_dialog").dialog('open');
    }
    else if (com=='Edit') {
        if ($('.trSelected',grid).length > 1) {
            alert("Editing of more than one selected bookmark is currently not supportet!");
            return false;
        }
        // Pre-fill the dialog from the markup of the selected row.
        var row = $('.trSelected',grid);
        $("input[name='bm_url']").setValue(row.find('.url').text());
        $("input[name='bm_title']").setValue(row.find('h3.linktitle').text().trim());
        $("textarea[name='bm_desc']").setValue(row.find('p.desc').text().trim());
        $("input[name='bm_tags']").setValue(row.find('p.tags').text().trim().replace(/,\s/g,","));
        $("input[name='bm_path']").setValue(row.find('p.folders').text().replace(/,\s/g,","));
        $("select[name='bm_public']").setValue(row.find('img').attr('alt'));
        $("#ymarks_add_dialog").dialog('open');
    }
    else {
        alert("Sorry, the function you have requested is not yet available!");
        return false;
    }
}
/**
 * Creates the (initially closed) modal dialog used to add or edit a bookmark.
 * OK posts the form fields to /api/ymarks/add_ymark.xml and, on success,
 * resets the form, closes the dialog and reloads the grid; Cancel only closes
 * the dialog.
 */
function bm_dialog() {
    /* Initialize Bookmark Dialog */
    $("#ymarks_add_dialog").dialog({
        autoOpen: false,
        height: 420,
        width: 340,
        position: ['top',100],
        modal: true,
        resizable: false,
        buttons: {
            OK: function() {
                var url = $("input[name='bm_url']").getValue();
                var title = $("input[name='bm_title']").getValue();
                var desc = $("textarea[name='bm_desc']").getValue();
                var tags = $("input[name='bm_tags']").getValue();
                var path = $("input[name='bm_path']").getValue();
                var pub = $("select[name='bm_public']").getValue();
                $.ajax({
                    type: "POST",
                    url: "/api/ymarks/add_ymark.xml",
                    // Passed as an object so jQuery URL-encodes every value.
                    // A hand-concatenated string breaks as soon as a field
                    // contains '&', '=', '+' or '#'.
                    data: {
                        url: url,
                        title: title,
                        desc: desc,
                        tags: tags,
                        folders: path,
                        "public": pub
                    },
                    dataType: "xml",
                    success: function(xml) {
                        $('#bmaddform').resetForm();
                        // Drop the blur handler installed by the 'Add' action.
                        $("#bm_url").unbind('blur');
                        $("#ymarks_add_dialog").dialog("close");
                        $('#ymarks_flexigrid').flexReload();
                        return false;
                    }
                });
            } ,
            Cancel: function() { $("#ymarks_add_dialog").dialog("close"); }
        }
    });
}

@ -0,0 +1,11 @@
/* Initialize Tag Actions */
/**
 * Toolbar handler for the tag buttons: resets and opens the "add tag" dialog
 * for 'Add', and the "edit tag" dialog for any other command.
 *
 * @param {string} com  label of the pressed button
 * @param {Object} grid grid the button belongs to
 */
function tag_action(com,grid) {
    var adding = (com=='Add');
    if (adding) {
        // NOTE(review): 'flex' is not declared in this block; presumably it is
        // a shared variable read by the tag dialogs - confirm.
        flex = grid;
    }
    var formSelector = adding ? '#tagaddform' : '#tageditform';
    var dialogSelector = adding ? "#tagadd" : "#tagedit";
    $(formSelector).resetForm();
    $(dialogSelector).dialog('open');
}

@ -0,0 +1,139 @@
/**
 * HTML-encodes a string by writing it as text into a detached <div> and
 * reading the resulting markup back.
 *
 * Declared as a regular function instead of an assignment to an undeclared
 * identifier, so it no longer relies on implicit global creation (which
 * throws in strict mode).
 *
 * @param {string} s text to encode
 * @returns {string} the markup jQuery produces for that text
 */
function HTMLenc(s) {
    return $('<div/>').text(s).html();
}
/*
 * Page bootstrap, run once the DOM is ready: creates the bookmark dialog,
 * the flexigrid bookmark table, the sidebar tabs and the folder tree view.
 */
$(document).ready(function() {
    var gridSelector = '#ymarks_flexigrid';
    // Grid height is derived from the viewport height.
    var gridHeight = document.documentElement.clientHeight - 200;

    /* Initialize Bookmark Dialog */
    bm_dialog();

    /* Initialize Flexigrid */
    // Handler of the '...' button: back to the initial sort order and query.
    var resetGrid = function() {
        $(gridSelector).flexOptions({
            sortname: "title",
            sortorder: "asc",
            query: ".*",
            qtype: "title"
        });
        $(gridSelector).flexReload();
    };

    var columns = [
        {display: 'Hash', name : 'hash', width : 50, sortable : false, align: 'center', hide: true},
        {display: 'Public', name : 'public', width : 25, sortable : true, align: 'center'},
        {display: 'Title', name : 'title', width : 400, sortable : true, align: 'left'},
        {display: 'Tags', name : 'tags', width : 160, sortable : false, align: 'left'},
        {display: 'Folders', name : 'folders', width : 160, sortable : true, align: 'left', hide: true},
        {display: 'Date added', name : 'date_added', width : 100, sortable : true, align: 'left'},
        {display: 'Date visited', name : 'date_visited', width : 100, sortable : true, align: 'left'}
    ];

    var toolbar = [
        {name: '...', bclass: 'burst', onpress: resetGrid},
        {separator: true},
        {name: 'Add', bclass: 'bookmark', onpress: bm_action},
        {name: 'Edit', bclass: 'edit', onpress: bm_action},
        {name: 'Delete', bclass: 'delete', onpress: bm_action},
        {separator: true},
        {name: 'Crawl', bclass: 'crawl', onpress: bm_action},
        {separator: true},
        {name: 'Add', bclass: 'addTag', onpress: tag_action},
        {name: 'Rename', bclass: 'editTag', onpress: tag_action},
        {separator: true},
        {name: 'Help', bclass: 'help', onpress: bm_action}
    ];

    var searchFields = [
        {display: 'Full text (regexp)', name : ''},
        {display: 'Tags (comma seperated)', name : '_tags'},
        {display: 'Tags (regexp)', name : 'tags'},
        {display: 'Singel Folder', name : '_folder'},
        {display: 'Folders (regexp)', name : 'folders'},
        {display: 'Title (regexp)', name : 'title'},
        {display: 'Description (regexp)', name : 'desc'}
    ];

    $(gridSelector).flexigrid({
        url: '/api/ymarks/get_ymark.json',
        dataType: 'json',
        method: 'GET',
        colModel: columns,
        buttons: toolbar,
        searchitems : searchFields,
        useRp: true,
        rp: 15,
        sortname: "title",
        sortorder: "asc",
        usepager: true,
        striped: true,
        nowrap: false,
        height: gridHeight,
        query: ".*",
        qtype: "title"
    });

    /* Initialize Sidebar */
    $('#ymarks_sidebar').height(gridHeight+90);
    // NOTE(review): $tabs and tabid are assigned without a declaration and
    // therefore become globals; kept that way on purpose - confirm whether
    // other scripts read them.
    $tabs = $('#ymarks_sidebar').tabs({
        // tabs options
    });
    // ui.tab / ui.panel / ui.index describe the selected tab (anchor element,
    // content element, zero-based index).
    $tabs.bind('tabsselect', function(event, ui) {
        tabid = "#"+ui.panel.id;
        // The tag cloud is (re)loaded each time its tab gets selected.
        if (tabid == "#ymarks_tags_tab") {
            loadTagCloud();
        }
        return true;
    });

    /* Initialize folder tree */
    var tree = $("#ymarks_treeview");
    tree.treeview({
        url: "/api/ymarks/get_treeview.json?bmtype=href",
        unique: true,
        persist: "location"
    });
    // Clicking a tree node filters the grid by the id of the closest <li>
    // ancestor of the click target.
    tree.bind("click", function(event) {
        var clicked = event.target;
        if (!$(clicked).is("li") && !$(clicked).parents("li").length) {
            return;
        }
        var folder = $(clicked).parents("li").filter(":first").attr("id");
        $(gridSelector).flexOptions({
            query: folder,
            qtype: "_folder",
            newp: 1
        });
        $(gridSelector).flexReload();
        return false;
    });

    $("#example").multiselect();
});
/**
 * Rebuilds the tag cloud: removes the current children of #ymarks_tagcloud,
 * requests the tag list and appends one clickable link per returned tag.
 * Clicking a link filters the bookmark grid by that tag.
 */
function loadTagCloud() {
    // Restrict the grid to one tag and jump back to the first page.
    var filterGridByTag = function() {
        var qtag = $(this).text().replace(/\s+$/g,"");
        var grid = '#ymarks_flexigrid';
        $(grid).flexOptions({query: qtag, qtype: "_tags", newp: 1});
        $(grid).flexReload();
    };

    // Turn one <tag> element of the response into a link inside the cloud.
    var appendTagLink = function() {
        var node = $(this);
        var count = node.attr('count');
        var tag = node.attr('tag');
        var scaled = (count/20)+0.3;
        // Font size grows with the count but never drops below 1em.
        var size = (scaled < 1) ? 1 : scaled;
        $('<a style="font-size:'+size+'em"></a>')
            .html(HTMLenc(tag)+' ')
            .appendTo('#ymarks_tagcloud')
            .bind('click', filterGridByTag);
    };

    $("#ymarks_tagcloud *").remove();
    $.ajax({
        type: "POST",
        url: "/api/ymarks/get_tags.xml?top=25&sort=alpha",
        dataType: "xml",
        cache: false,
        success: function(xml) {
            $(xml).find('tag').each(appendTagLink);
        }
    });
}

@ -442,11 +442,13 @@ function yacysearch(clear) {
} }
); );
function autoOpenSidebar() { function autoOpenSidebar() {
window.setTimeout(function() { // The delay prevents the sidebar to open on every intermediate search results window.setTimeout(function() { // The delay prevents the sidebar to open on every intermediate search results
if( $("#yquery").getValue() == ycurr) { // Open side bar only if result matches current search term if($("#ypopup .yloading").length == 0) { // Check again wether a search result is still loading
$("#yside").dialog('open'); if( $("#yquery").getValue() == ycurr) { // Open side bar only if result matches current search term
$("#yquery").focus(); $("#yside").dialog('open');
} $("#yquery").focus();
}
}
} , 1000); } , 1000);
} }
function cancelNavigators(ynavigators, appendTo) { // Include checkboxes to release navigators function cancelNavigators(ynavigators, appendTo) { // Include checkboxes to release navigators
@ -454,7 +456,7 @@ function yacysearch(clear) {
var query = $("#yquery").getValue(); var query = $("#yquery").getValue();
for ( var i=0, len=arLen; i<len; ++i ){ for ( var i=0, len=arLen; i<len; ++i ){
if(query.indexOf(ynavigators[i]) != -1) // Check wether search term still contains the navigator if(query.indexOf(ynavigators[i]) != -1) // Check wether search term still contains the navigator
$(' <input type="checkbox" checked="checked" class="ynav-cancel" name="ynav'+i+'" value="'+ynavigators[i]+'"> '+ynavigators[i]+'<br>').appendTo(appendTo); $(' <input type="checkbox" checked="checked" class="ynav-cancel" name="ynav'+i+'" value="'+ynavigators[i]+'"><span class="ytxt">'+ynavigators[i]+'</span><br>').appendTo(appendTo);
else else
ynavigators.splice(i, 1); // Remove navigator from array as it has been removed manually from search term ynavigators.splice(i, 1); // Remove navigator from array as it has been removed manually from search term
} }

File diff suppressed because one or more lines are too long

@ -3,13 +3,14 @@ package de.anomic.data.ymark;
import java.io.ByteArrayInputStream; import java.io.ByteArrayInputStream;
import java.io.IOException; import java.io.IOException;
import java.net.MalformedURLException; import java.net.MalformedURLException;
import java.util.Arrays;
import java.util.Enumeration; import java.util.Enumeration;
import java.util.HashSet;
import java.util.Iterator; import java.util.Iterator;
import java.util.Map; import java.util.Map;
import java.util.TreeMap; import java.util.TreeMap;
import java.util.TreeSet; import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue; import java.util.concurrent.ArrayBlockingQueue;
import net.yacy.cora.document.UTF8; import net.yacy.cora.document.UTF8;
import net.yacy.cora.services.federated.yacy.CacheStrategy; import net.yacy.cora.services.federated.yacy.CacheStrategy;
import net.yacy.document.Condenser; import net.yacy.document.Condenser;
@ -28,6 +29,8 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
public final static String SPACE = " "; public final static String SPACE = " ";
public final static String POISON = ""; public final static String POISON = "";
public final static HashSet<String> stopwords = new HashSet<String>(Arrays.asList(".", "!", "?", "nbsp", "uuml", "ouml", "auml", "amp", "quot", "laquo", "raquo"));
private final ArrayBlockingQueue<String> bmkQueue; private final ArrayBlockingQueue<String> bmkQueue;
private final YMarkTables ymarks; private final YMarkTables ymarks;
@ -52,7 +55,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
this.merge = true; this.merge = true;
} }
private Document loadDocument(final String url) { private static Document loadDocument(final String url, final LoaderDispatcher loader) {
DigestURI uri; DigestURI uri;
Response response; Response response;
try { try {
@ -62,16 +65,9 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
return null; return null;
} }
try { try {
response = this.loader.load(this.loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true); response = loader.load(loader.request(uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true);
} catch (final IOException e) { } catch (final IOException e) {
Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to IOException for url: "+url); Log.logWarning(YMarkTables.BOOKMARKS_LOG, "loadDocument failed due to IOException for url: "+url);
try {
this.ymarks.addFolder(this.bmk_user, url, "/IOExceptions");
} catch (final IOException e1) {
Log.logException(e1);
} catch (final RowSpaceExceededException e1) {
Log.logException(e1);
}
return null; return null;
} }
try { try {
@ -81,56 +77,59 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
return null; return null;
} }
} }
public String autoTag(final String url, final int max, final TreeMap<String, YMarkTag> tags) { public static String autoTag(final Document document, final int max, final TreeMap<String, YMarkTag> tags) {
final Document document = loadDocument(url);
final TreeSet<YMarkTag> topwords = new TreeSet<YMarkTag>(); final TreeSet<YMarkTag> topwords = new TreeSet<YMarkTag>();
// final TreeMap<String, YMarkTag> pairs = new TreeMap<String, YMarkTag>(); StringBuilder token;
StringBuilder token;
// StringBuilder pair = new StringBuilder(64);
if(document != null) { if(document != null) {
//get words from document //get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words(); final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
// generate potential tags from document title, description and subject // generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32; final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;
final StringBuilder buffer = new StringBuilder(bufferSize); final StringBuilder buffer = new StringBuilder(bufferSize);
buffer.append(document.dc_title()); buffer.append(document.dc_title().toLowerCase());
buffer.append(document.dc_description()); buffer.append(document.dc_description().toLowerCase());
buffer.append(document.dc_subject(' ')); buffer.append(document.dc_subject(' ').toLowerCase());
final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib); final Enumeration<StringBuilder> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
int count = 0; int count = 0;
// get phrases
final TreeMap<String, YMarkTag> phrases = getPhrases(document, 2);
phrases.putAll(getPhrases(document, 3));
phrases.putAll(getPhrases(document, 4));
final Iterator<String> iter = phrases.keySet().iterator();
while(iter.hasNext()) {
count = 10;
final String phrase = iter.next();
if(phrases.get(phrase).size() > 3 && phrases.get(phrase).size() < 10) {
count = phrases.get(phrase).size() * phrase.split(" ").length * 35;
}
if(isDigitSpace(phrase)) {
count = 10;
}
if(phrases.get(phrase).size() > 2 && buffer.indexOf(phrase) > 1) {
count = count * 10;
}
topwords.add(new YMarkTag(phrase, count));
}
// loop through potential tag and rank them // loop through potential tag and rank them
while(tokens.hasMoreElements()) { while(tokens.hasMoreElements()) {
count = 0; count = 0;
token = tokens.nextElement(); token = tokens.nextElement();
/*
pair.delete(0, pair.indexOf(SPACE)+1);
if(pair.length() > 1)
pair.append(SPACE);
pair.append(token);
if(pair.indexOf(SPACE) > 1 && pairs.containsKey(pair.toString())) {
pairs.get(pair.toString()).inc();
} else {
pairs.put(pair.toString(), new YMarkTag(pair.toString()));
}
*/
// check if the token appears in the text // check if the token appears in the text
if (words.containsKey(token)) { if (words.containsKey(token.toString())) {
final Word word = words.get(token); final Word word = words.get(token.toString());
// token appears in text and matches an existing bookmark tag // token appears in text and matches an existing bookmark tag
if (tags.containsKey(token)) { if (tags.containsKey(token.toString())) {
count = word.occurrences() * tags.get(token).size() * 100; count = word.occurrences() * tags.get(token.toString()).size() * 200;
} }
// token appears in text and has more than 3 characters // token appears in text and has more than 3 characters
if (token.length()>3) { else if (token.length()>3) {
count = word.occurrences() * 100; count = word.occurrences() * 100;
} }
topwords.add(new YMarkTag(token.toString(), count)); topwords.add(new YMarkTag(token.toString(), count));
@ -152,7 +151,64 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
final String clean = YMarkUtil.cleanTagsString(buffer.toString()); final String clean = YMarkUtil.cleanTagsString(buffer.toString());
return clean; return clean;
} }
return new String(); return new String();
}
/**
 * Collects the word sequences of <code>size</code> consecutive tokens found in
 * the document text, skipping every token listed in {@link #stopwords}.
 * A sliding window is kept in a buffer: once it holds <code>size</code>
 * tokens, each further token pushes the oldest one out.
 *
 * @param document the document whose text is tokenized
 * @param size     number of tokens per phrase
 * @return map from phrase (tokens joined by {@link #SPACE}) to its tag object;
 *         <code>inc()</code> is called on the tag each time the phrase shows up again
 */
private static TreeMap<String, YMarkTag> getPhrases(final Document document, final int size) {
    final TreeMap<String, YMarkTag> phrases = new TreeMap<String, YMarkTag>();
    final StringBuilder phrase = new StringBuilder(128);
    final Enumeration<StringBuilder> tokens = new WordTokenizer(document.getText(), LibraryProvider.dymLib);
    int count = 0;

    // loop through text
    while (tokens.hasMoreElements()) {
        final StringBuilder token = tokens.nextElement();
        if (stopwords.contains(token.toString())) {
            continue;
        }

        // if we have a full phrase, delete the first token
        count++;
        if (count > size) {
            phrase.delete(0, phrase.indexOf(SPACE) + 1);
        }

        // append new token; the separator is needed whenever the window is not
        // empty. The former "> 1" check glued a one-character first token to
        // its successor without a space.
        if (phrase.length() > 0) {
            phrase.append(SPACE);
        }
        phrase.append(token);

        if (count >= size) { // make sure we really have a phrase
            final String key = phrase.toString();
            final YMarkTag known = phrases.get(key);
            if (known != null) {
                known.inc();
            } else {
                phrases.put(key, new YMarkTag(key));
            }
        }
    }
    return phrases;
}
/**
 * Loads the document behind <code>url</code> and computes its tag string.
 *
 * @param url    address of the document to tag
 * @param loader loader used to fetch the document
 * @param max    passed through to the document-based autoTag
 * @param tags   passed through to the document-based autoTag
 * @return the tag string, or the marker "/IOExceptions" when the document
 *         could not be loaded
 */
public static String autoTag(final String url, final LoaderDispatcher loader, final int max, final TreeMap<String, YMarkTag> tags) {
    final Document loaded = loadDocument(url, loader);
    return (loaded == null) ? "/IOExceptions" : autoTag(loaded, max, tags);
}
public static boolean isDigitSpace(String str) {
if (str == null) {
return false;
}
int sz = str.length();
for (int i = 0; i < sz; i++) {
if ((Character.isDigit(str.charAt(i)) == false) && (str.charAt(i) != ' ')) {
return false;
}
}
return true;
} }
public void run() { public void run() {
@ -165,8 +221,11 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
final TreeMap<String, YMarkTag> tags = this.ymarks.getTags(this.bmk_user); final TreeMap<String, YMarkTag> tags = this.ymarks.getTags(this.bmk_user);
Log.logInfo(YMarkTables.BOOKMARKS_LOG, "autoTagger queue size: "+this.bmkQueue.size()); Log.logInfo(YMarkTables.BOOKMARKS_LOG, "autoTagger queue size: "+this.bmkQueue.size());
while((url = this.bmkQueue.take()) != POISON) { while((url = this.bmkQueue.take()) != POISON) {
tagString = autoTag(url, 5, tags); tagString = autoTag(url, this.loader, 5, tags);
if (tagString.equals("/IOExceptions")) {
this.ymarks.addFolder(bmk_user, url, tagString);
tagString = "";
}
// update tags // update tags
this.ymarks.addTags(this.bmk_user, url, tagString, this.merge); this.ymarks.addTags(this.bmk_user, url, tagString, this.merge);

@ -93,12 +93,13 @@ public class YMarkMetadata {
this.indexSegment = null; this.indexSegment = null;
} }
public void loadDocument(final LoaderDispatcher loader) throws IOException, Failure { public Document loadDocument(final LoaderDispatcher loader) throws IOException, Failure {
if(this.document == null) { if(this.document == null) {
Response response = null; Response response = null;
response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true); response = loader.load(loader.request(this.uri, true, false), CacheStrategy.IFEXIST, Integer.MAX_VALUE, true);
this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse()); this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
} }
return this.document;
} }
public EnumMap<METADATA, String> getMetadata() { public EnumMap<METADATA, String> getMetadata() {
@ -140,7 +141,7 @@ public class YMarkMetadata {
} }
return metadata; return metadata;
} }
public TreeMap<String,Word> getWordCounts() { public TreeMap<String,Word> getWordCounts() {
if (this.document != null) { if (this.document != null) {
return sortWordCounts(new Condenser(this.document, true, true, LibraryProvider.dymLib).words()); return sortWordCounts(new Condenser(this.document, true, true, LibraryProvider.dymLib).words());

Loading…
Cancel
Save