git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7648 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
399d7d6878
commit
78d6d6ca06
@ -1,259 +0,0 @@
|
||||
package de.anomic.data;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.cora.storage.ConcurrentARC;
|
||||
import net.yacy.kelondro.blob.Tables;
|
||||
import net.yacy.kelondro.blob.Tables.Data;
|
||||
import net.yacy.kelondro.blob.Tables.Row;
|
||||
import net.yacy.kelondro.index.RowSpaceExceededException;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
|
||||
public class YMarkIndex {
|
||||
|
||||
/**
 * Columns of an index table row. Each constant carries the column key and
 * the default value that is used when a row does not contain the column.
 */
public enum INDEX {
    ID("id", ""),
    NAME("name", ""),
    DESC("desc", ""),
    URLS("urls", "");

    private final String key;
    private final String dflt;

    private INDEX(final String columnKey, final String defaultValue) {
        this.key = columnKey;
        this.dflt = defaultValue;
    }

    /** @return the column key of this index field */
    public String key() {
        return key;
    }

    /** @return the default value of this index field */
    public String deflt() {
        return dflt;
    }
}
|
||||
|
||||
/** Operations that {@code updateIndexTable} can apply to the URL set of an index entry. */
public enum INDEX_ACTION {
    ADD, REMOVE
}
|
||||
|
||||
public final static String PATTERN_PREFIX = "^\\Q";
|
||||
public final static String PATTERN_POSTFIX = YMarkTables.FOLDERS_SEPARATOR+"\\E.*$";
|
||||
|
||||
private final WorkTables worktables;
|
||||
private final String table_basename;
|
||||
private final ConcurrentARC<String, byte[]> cache;
|
||||
|
||||
public YMarkIndex(final Tables wt, final String tb) {
|
||||
this.worktables = (WorkTables)wt;
|
||||
this.table_basename = tb;
|
||||
this.cache = new ConcurrentARC<String, byte[]>(50,1);
|
||||
}
|
||||
|
||||
public String getKeyname(final String user, final byte[] key) throws IOException, RowSpaceExceededException {
|
||||
final String index_table = user + this.table_basename;
|
||||
Tables.Row row = this.worktables.select(index_table, key);
|
||||
return row.get(INDEX.NAME.key(), INDEX.NAME.deflt());
|
||||
}
|
||||
|
||||
public Iterator<String> getFolders(final String user, final String root) throws IOException {
|
||||
final String index_table = user + this.table_basename;
|
||||
final TreeSet<String> folders = new TreeSet<String>();
|
||||
final Pattern r = Pattern.compile(PATTERN_PREFIX + root + PATTERN_POSTFIX);
|
||||
final Iterator<Row> it = this.worktables.iterator(index_table, INDEX.NAME.key(), r);
|
||||
final StringBuilder path = new StringBuilder(100);
|
||||
Row folder;
|
||||
|
||||
while (it.hasNext()) {
|
||||
folder = it.next();
|
||||
path.setLength(0);
|
||||
path.append(folder.get(INDEX.NAME.key(), INDEX.NAME.deflt()));
|
||||
//TODO: get rid of .toString.equals()
|
||||
while(path.length() > 0 && !path.toString().equals(root)){
|
||||
folders.add(path.toString());
|
||||
path.setLength(path.lastIndexOf(YMarkTables.FOLDERS_SEPARATOR));
|
||||
}
|
||||
}
|
||||
if (!root.equals(YMarkTables.FOLDERS_ROOT)) { folders.add(root); }
|
||||
return folders.iterator();
|
||||
}
|
||||
|
||||
protected void clearCache() {
|
||||
this.cache.clear();
|
||||
}
|
||||
|
||||
protected void createIndexEntry(final String user, final String keyname, final HashSet<String> urlSet) throws IOException {
|
||||
final byte[] key = YMarkTables.getKeyId(keyname);
|
||||
final String index_table = user + this.table_basename;
|
||||
final String cacheKey = index_table+":"+keyname;
|
||||
final byte[] BurlSet = YMarkTables.keySetToBytes(urlSet);
|
||||
Data tagEntry = new Data();
|
||||
this.cache.insert(cacheKey, BurlSet);
|
||||
tagEntry.put(INDEX.NAME.key, keyname);
|
||||
tagEntry.put(INDEX.URLS.key, BurlSet);
|
||||
this.worktables.insert(index_table, key, tagEntry);
|
||||
}
|
||||
|
||||
protected void removeIndexEntry(final String user, String keysString, final byte[] urlHash) {
|
||||
final String[] keyArray = keysString.split(YMarkTables.TAGS_SEPARATOR);
|
||||
for (final String key : keyArray) {
|
||||
this.updateIndexTable(user, key, urlHash, INDEX_ACTION.REMOVE);
|
||||
}
|
||||
}
|
||||
|
||||
protected void insertIndexEntry(final String user, String keysString, final byte[] urlHash) {
|
||||
final String[] keyArray = keysString.split(YMarkTables.TAGS_SEPARATOR);
|
||||
for (final String key : keyArray) {
|
||||
this.updateIndexTable(user, key, urlHash, INDEX_ACTION.ADD);
|
||||
}
|
||||
}
|
||||
|
||||
protected void updateIndexEntry(final String user, final byte[] urlHash, final HashSet<String> oldSet, final HashSet<String> newSet) {
|
||||
Iterator <String> tagIter;
|
||||
HashSet<String> urlSet = new HashSet<String>(newSet);
|
||||
newSet.removeAll(oldSet);
|
||||
tagIter = newSet.iterator();
|
||||
while(tagIter.hasNext()) {
|
||||
this.updateIndexTable(user, tagIter.next(), urlHash, INDEX_ACTION.ADD);
|
||||
}
|
||||
oldSet.removeAll(urlSet);
|
||||
tagIter=oldSet.iterator();
|
||||
while(tagIter.hasNext()) {
|
||||
this.updateIndexTable(user, tagIter.next(), urlHash, INDEX_ACTION.REMOVE);
|
||||
}
|
||||
}
|
||||
|
||||
public HashSet<String> getBookmarkIds(final String user, final String keyname) throws IOException, RowSpaceExceededException {
|
||||
final String index_table = user + this.table_basename;
|
||||
final String cacheKey = index_table+":"+keyname;
|
||||
if (this.cache.containsKey(cacheKey)) {
|
||||
return YMarkTables.keysStringToSet(UTF8.String(this.cache.get(cacheKey)));
|
||||
} else {
|
||||
final Tables.Row idx_row = this.worktables.select(index_table, YMarkTables.getKeyId(keyname));
|
||||
if (idx_row != null) {
|
||||
final byte[] keys = idx_row.get(INDEX.URLS.key);
|
||||
this.cache.put(cacheKey, keys);
|
||||
return YMarkTables.keysStringToSet(UTF8.String(keys));
|
||||
}
|
||||
}
|
||||
return new HashSet<String>();
|
||||
}
|
||||
|
||||
public Iterator<Tables.Row> getBookmarks(final String user, final String keyname) throws IOException, RowSpaceExceededException {
|
||||
final Iterator<String> bit = getBookmarkIds(user, keyname).iterator();
|
||||
final HashSet<Tables.Row> bookmarks = new HashSet<Tables.Row>();
|
||||
while(bit.hasNext()) {
|
||||
bookmarks.add(this.worktables.select(YMarkTables.TABLES.BOOKMARKS.tablename(user), bit.next().getBytes()));
|
||||
}
|
||||
return bookmarks.iterator();
|
||||
}
|
||||
|
||||
public HashSet<String> getBookmarkIds(final String user, final String[] keyArray) throws IOException, RowSpaceExceededException {
|
||||
final HashSet<String> urlSet = new HashSet<String>();
|
||||
urlSet.addAll(getBookmarkIds(user, keyArray[0]));
|
||||
if (urlSet.isEmpty())
|
||||
return urlSet;
|
||||
if (keyArray.length > 1) {
|
||||
for (final String keyname : keyArray) {
|
||||
urlSet.retainAll(getBookmarkIds(user, keyname));
|
||||
if (urlSet.isEmpty())
|
||||
return urlSet;
|
||||
}
|
||||
}
|
||||
return urlSet;
|
||||
}
|
||||
|
||||
public void rebuildIndex(final String bmk_user) throws IOException {
|
||||
final Iterator<Tables.Row> plainIterator = this.worktables.iterator(YMarkTables.TABLES.BOOKMARKS.tablename(bmk_user));
|
||||
this.clearCache();
|
||||
this.worktables.clear(bmk_user + this.table_basename);
|
||||
while (plainIterator.hasNext()) {
|
||||
Tables.Row row = plainIterator.next();
|
||||
if (row != null && row.containsKey(this.table_basename.substring(1))) {
|
||||
final String url = UTF8.String(row.get(YMarkTables.BOOKMARK.URL.key()));
|
||||
final String key = this.table_basename.substring(1);
|
||||
final String keysString = row.get(key, YMarkTables.BOOKMARK.get(key).deflt());
|
||||
this.insertIndexEntry(bmk_user, keysString, YMarkTables.getBookmarkId(url));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* YMark function that updates the tag/folder index
|
||||
* @param user
|
||||
* @param keyname
|
||||
* @param url is the url has as returned by DigestURI.hash()
|
||||
* @param action is either add (1) or remove (2)
|
||||
*/
|
||||
protected void updateIndexTable(final String user, final String keyname, final byte[] url, final INDEX_ACTION action) {
|
||||
final String index_table = user + this.table_basename;
|
||||
final String cacheKey = index_table+":"+keyname;
|
||||
final byte[] key = YMarkTables.getKeyId(keyname);
|
||||
final String urlHash = UTF8.String(url);
|
||||
Tables.Row row = null;
|
||||
|
||||
// try to load urlSet from cache
|
||||
HashSet<String>urlSet = this.cache.containsKey(cacheKey) ? YMarkTables.keysStringToSet(UTF8.String(this.cache.get(cacheKey))) : new HashSet<String>();
|
||||
|
||||
try {
|
||||
row = this.worktables.select(index_table, key);
|
||||
|
||||
// key has no index_table entry
|
||||
if(row == null) {
|
||||
switch (action) {
|
||||
case ADD:
|
||||
urlSet.add(urlHash);
|
||||
createIndexEntry(user, keyname, urlSet);
|
||||
break;
|
||||
case REMOVE:
|
||||
// key has no index_table entry but a cache entry
|
||||
// TODO: this shouldn't happen
|
||||
if(!urlSet.isEmpty()) {
|
||||
urlSet.remove(urlHash);
|
||||
createIndexEntry(user, keyname, urlSet);
|
||||
}
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
}
|
||||
// key has an existing index_table entry
|
||||
else {
|
||||
byte[] BurlSet = null;
|
||||
// key has no cache entry
|
||||
if (urlSet.isEmpty()) {
|
||||
// load urlSet from index_table
|
||||
urlSet = YMarkTables.keysStringToSet(UTF8.String(row.get(INDEX.URLS.key)));
|
||||
}
|
||||
switch (action) {
|
||||
case ADD:
|
||||
urlSet.add(urlHash);
|
||||
break;
|
||||
case REMOVE:
|
||||
urlSet.remove(urlHash);
|
||||
break;
|
||||
default:
|
||||
break;
|
||||
}
|
||||
if (urlSet.isEmpty()) {
|
||||
this.cache.remove(cacheKey);
|
||||
this.worktables.delete(index_table, key);
|
||||
} else {
|
||||
BurlSet = YMarkTables.keySetToBytes(urlSet);
|
||||
this.cache.insert(cacheKey, BurlSet);
|
||||
row.put(INDEX.URLS.key, BurlSet);
|
||||
this.worktables.update(index_table, row);
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
} catch (RowSpaceExceededException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
}
|
@ -1,517 +0,0 @@
|
||||
package de.anomic.data;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.ArrayList;
|
||||
import java.util.Date;
|
||||
import java.util.EnumMap;
|
||||
import java.util.EnumSet;
|
||||
import java.util.Enumeration;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import net.yacy.cora.date.ISO8601Formatter;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.LibraryProvider;
|
||||
import net.yacy.document.WordTokenizer;
|
||||
import net.yacy.kelondro.blob.Tables;
|
||||
import net.yacy.kelondro.blob.Tables.Data;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.index.RowSpaceExceededException;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import de.anomic.search.Segment;
|
||||
|
||||
public class YMarkTables {
|
||||
|
||||
/** The per-user YMark tables; each constant carries the suffix appended to the user name. */
public enum TABLES {
    BOOKMARKS("_bookmarks"),
    TAGS("_tags"),
    FOLDERS("_folders");

    private final String basename;

    private TABLES(final String suffix) {
        this.basename = suffix;
    }

    /** @return the table name suffix, e.g. {@code "_tags"} */
    public String basename() {
        return basename;
    }

    /**
     * @param bmk_user user name
     * @return the user's table name: user name followed by the suffix
     */
    public String tablename(String bmk_user) {
        return bmk_user + basename;
    }
}
|
||||
|
||||
/** URL scheme prefixes known to YMark. */
public enum PROTOCOLS {
    HTTP("http://"),
    HTTPS("https://");

    private final String protocol;

    private PROTOCOLS(final String prefix) {
        this.protocol = prefix;
    }

    /** @return the scheme prefix including {@code "://"} */
    public String protocol() {
        return protocol;
    }

    /**
     * @param s text to put behind the scheme prefix
     * @return the scheme prefix followed by {@code s}
     */
    public String protocol(String s) {
        return protocol + s;
    }
}
|
||||
|
||||
/**
 * Columns of a bookmark row. Each constant carries the column key, the default value,
 * the matching HTML and XBEL attribute names and a type label.
 */
public enum BOOKMARK {
    //             key                dflt          html_attrb        xbel_attrb      type
    URL           ("url",            "",           "href",           "href",         "link"),
    TITLE         ("title",          "",           "",               "",             "meta"),
    DESC          ("desc",           "",           "",               "",             "comment"),
    DATE_ADDED    ("date_added",     "",           "add_date",       "added",        "date"),
    DATE_MODIFIED ("date_modified",  "",           "last_modified",  "modified",     "date"),
    DATE_VISITED  ("date_visited",   "",           "last_visited",   "visited",      "date"),
    PUBLIC        ("public",         "false",      "",               "yacy:public",  "lock"),
    TAGS          ("tags",           "unsorted",   "shortcuturl",    "yacy:tags",    "tag"),
    VISITS        ("visits",         "0",          "",               "yacy:visits",  "stat"),
    FOLDERS       ("folders",        "/unsorted",  "",               "",             "folder");

    private final String key;
    private final String dflt;
    private final String html_attrb;
    private final String xbel_attrb;
    private final String type;

    /** Column key -> constant, filled once when the enum is initialized. */
    private static final Map<String,BOOKMARK> lookup = new HashMap<String,BOOKMARK>();
    static {
        for (final BOOKMARK b : EnumSet.allOf(BOOKMARK.class)) {
            lookup.put(b.key(), b);
        }
    }

    private BOOKMARK(String k, String s, String a, String x, String t) {
        this.key = k;
        this.dflt = s;
        this.html_attrb = a;
        this.xbel_attrb = x;
        this.type = t;
    }

    /**
     * @param key column key
     * @return the constant with that key, or {@code null} if there is none
     */
    public static BOOKMARK get(String key) {
        return lookup.get(key);
    }

    /**
     * @param key column key
     * @return {@code true} if a constant with that key exists
     */
    public static boolean contains(String key) {
        return lookup.containsKey(key);
    }

    /** @return the column key */
    public String key() {
        return this.key;
    }

    /** @return the default value of the column */
    public String deflt() {
        return this.dflt;
    }

    /** @return the HTML attribute name, possibly empty */
    public String html_attrb() {
        return this.html_attrb;
    }

    /** @return the XBEL attribute name, possibly empty */
    public String xbel_attrb() {
        return this.xbel_attrb;
    }

    /**
     * Builds the text fragment: a double quote, a line break, a space,
     * the XBEL attribute name, {@code =} and an opening double quote.
     *
     * @return the fragment
     */
    public String xbel() {
        // local builder: a shared static buffer would be corrupted by concurrent callers
        final StringBuilder fragment = new StringBuilder(25);
        fragment.append('"');
        fragment.append('\n');
        fragment.append(' ');
        fragment.append(this.xbel_attrb);
        fragment.append('=');
        fragment.append('"');
        return fragment.toString();
    }

    /** @return the type label of the column */
    public String type() {
        return this.type;
    }
}
|
||||
|
||||
/** Keys of the metadata maps produced by the {@code getMetadata} methods. */
public enum METADATA {
    TITLE, DESCRIPTION, FAVICON, KEYWORDS,
    LANGUAGE, CREATOR, PUBLISHER, CHARSET,
    MIMETYPE, SIZE, WORDCOUNT, IN_URLDB,
    FRESHDATE, LOADDATE, MODDATE, SNIPPET
}
|
||||
|
||||
public final static HashMap<String,String> POISON = new HashMap<String,String>();
|
||||
|
||||
public final static String TAGS_SEPARATOR = ",";
|
||||
|
||||
public final static String FOLDERS_SEPARATOR = "/";
|
||||
public final static String FOLDERS_ROOT = "/";
|
||||
public final static String FOLDERS_UNSORTED = "/unsorted";
|
||||
public final static String FOLDERS_IMPORTED = "/imported";
|
||||
public static final int FOLDER_BUFFER_SIZE = 100;
|
||||
|
||||
public final static String BOOKMARKS_LOG = "BOOKMARKS";
|
||||
public final static String BOOKMARKS_ID = "id";
|
||||
|
||||
public final static String USER_ADMIN = "admin";
|
||||
public final static String USER_AUTHENTICATE = "AUTHENTICATE";
|
||||
public final static String USER_AUTHENTICATE_MSG = "Authentication required!";
|
||||
|
||||
private WorkTables worktables;
|
||||
public YMarkIndex tags;
|
||||
public YMarkIndex folders;
|
||||
|
||||
public YMarkTables(final Tables wt) {
|
||||
this.worktables = (WorkTables)wt;
|
||||
this.folders = new YMarkIndex(this.worktables, TABLES.FOLDERS.basename());
|
||||
this.tags = new YMarkIndex(this.worktables, TABLES.TAGS.basename());
|
||||
}
|
||||
|
||||
public static Date parseISO8601(final String s) throws ParseException {
|
||||
if(s == null || s.length() < 1) {
|
||||
throw new ParseException("parseISO8601 - empty string, nothing to parse", 0);
|
||||
}
|
||||
SimpleDateFormat dateformat;
|
||||
StringBuilder date = new StringBuilder(s);
|
||||
if(s.length()==10)
|
||||
dateformat = new SimpleDateFormat("yyyy-MM-dd");
|
||||
else {
|
||||
dateformat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssz");
|
||||
if(date.charAt(date.length()-1) == 'Z') {
|
||||
date.deleteCharAt(date.length()-1);
|
||||
date.append("GMT-00:00");
|
||||
} else {
|
||||
date.insert(date.length()-6, "GMT");
|
||||
}
|
||||
}
|
||||
return dateformat.parse(date.toString());
|
||||
}
|
||||
|
||||
public static String getISO8601(final byte[] date) {
|
||||
if(date != null) {
|
||||
final String s = UTF8.String(date);
|
||||
if(s != null && s.length() > 0)
|
||||
return ISO8601Formatter.FORMATTER.format(new Date(Long.parseLong(s)));
|
||||
}
|
||||
return "";
|
||||
}
|
||||
|
||||
public final static byte[] getBookmarkId(String url) throws MalformedURLException {
|
||||
return (new DigestURI(url, null)).hash();
|
||||
}
|
||||
|
||||
public final static byte[] getKeyId(final String tag) {
|
||||
return Word.word2hash(tag.toLowerCase());
|
||||
}
|
||||
|
||||
public final static byte[] keySetToBytes(final HashSet<String> urlSet) {
|
||||
return keySetToString(urlSet).getBytes();
|
||||
}
|
||||
|
||||
public final static String keySetToString(final HashSet<String> urlSet) {
|
||||
final Iterator<String> urlIter = urlSet.iterator();
|
||||
final
|
||||
StringBuilder urls = new StringBuilder(urlSet.size()*20);
|
||||
while(urlIter.hasNext()) {
|
||||
urls.append(TAGS_SEPARATOR);
|
||||
urls.append(urlIter.next());
|
||||
}
|
||||
urls.deleteCharAt(0);
|
||||
return urls.toString();
|
||||
}
|
||||
|
||||
public final static HashSet<String> keysStringToSet(final String keysString) {
|
||||
HashSet<String> keySet = new HashSet<String>();
|
||||
final String[] keyArray = keysString.split(TAGS_SEPARATOR);
|
||||
for (final String key : keyArray) {
|
||||
keySet.add(key);
|
||||
}
|
||||
return keySet;
|
||||
}
|
||||
|
||||
public final static String cleanTagsString(final String tagsString) {
|
||||
StringBuilder ts = new StringBuilder(tagsString);
|
||||
if(ts.length() == 0)
|
||||
return YMarkTables.BOOKMARK.TAGS.deflt();
|
||||
// get rid of double commas and space characters following a comma
|
||||
for (int i = 0; i < ts.length()-1; i++) {
|
||||
if (ts.charAt(i) == TAGS_SEPARATOR.charAt(0)) {
|
||||
if (ts.charAt(i+1) == TAGS_SEPARATOR.charAt(0) || ts.charAt(i+1) == ' ') {
|
||||
ts.deleteCharAt(i+1);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
// get rid of heading and trailing comma
|
||||
if (ts.charAt(0) == TAGS_SEPARATOR.charAt(0))
|
||||
ts.deleteCharAt(0);
|
||||
if (ts.charAt(ts.length()-1) == TAGS_SEPARATOR.charAt(0))
|
||||
ts.deleteCharAt(ts.length()-1);
|
||||
return ts.toString();
|
||||
}
|
||||
|
||||
public final static String cleanFoldersString(final String foldersString) {
|
||||
StringBuilder fs = new StringBuilder(cleanTagsString(foldersString));
|
||||
if(fs.length() == 0)
|
||||
return YMarkTables.BOOKMARK.FOLDERS.deflt();
|
||||
for (int i = 0; i < fs.length()-1; i++) {
|
||||
if (fs.charAt(i) == FOLDERS_SEPARATOR.charAt(0)) {
|
||||
if (fs.charAt(i+1) == TAGS_SEPARATOR.charAt(0) || fs.charAt(i+1) == FOLDERS_SEPARATOR.charAt(0)) {
|
||||
fs.deleteCharAt(i);
|
||||
i--;
|
||||
} else if (fs.charAt(i+1) == ' ') {
|
||||
fs.deleteCharAt(i+1);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (fs.charAt(fs.length()-1) == FOLDERS_SEPARATOR.charAt(0)) {
|
||||
fs.deleteCharAt(fs.length()-1);
|
||||
}
|
||||
return fs.toString();
|
||||
}
|
||||
|
||||
public void clearIndex(String tablename) {
|
||||
if (tablename.endsWith(TABLES.TAGS.basename()))
|
||||
this.tags.clearCache();
|
||||
if (tablename.endsWith(TABLES.FOLDERS.basename()))
|
||||
this.folders.clearCache();
|
||||
}
|
||||
|
||||
public void deleteBookmark(final String bmk_user, final byte[] urlHash) throws IOException, RowSpaceExceededException {
|
||||
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
|
||||
Tables.Row bmk_row = null;
|
||||
bmk_row = this.worktables.select(bmk_table, urlHash);
|
||||
if(bmk_row != null) {
|
||||
final String tagsString = bmk_row.get(YMarkTables.BOOKMARK.TAGS.key(),YMarkTables.BOOKMARK.TAGS.deflt());
|
||||
tags.removeIndexEntry(bmk_user, tagsString, urlHash);
|
||||
final String foldersString = bmk_row.get(YMarkTables.BOOKMARK.FOLDERS.key(),YMarkTables.FOLDERS_ROOT);
|
||||
folders.removeIndexEntry(bmk_user, foldersString, urlHash);
|
||||
this.worktables.delete(bmk_table,urlHash);
|
||||
}
|
||||
}
|
||||
|
||||
public void deleteBookmark(final String bmk_user, final String url) throws IOException, RowSpaceExceededException {
|
||||
this.deleteBookmark(bmk_user, getBookmarkId(url));
|
||||
}
|
||||
|
||||
public void addBookmark(final String bmk_user, final HashMap<String,String> bmk, final boolean importer) throws IOException, RowSpaceExceededException {
|
||||
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
|
||||
final String date = String.valueOf(System.currentTimeMillis());
|
||||
final byte[] urlHash = getBookmarkId(bmk.get(BOOKMARK.URL.key()));
|
||||
Tables.Row bmk_row = null;
|
||||
|
||||
if (urlHash != null) {
|
||||
bmk_row = this.worktables.select(bmk_table, urlHash);
|
||||
if (bmk_row == null) {
|
||||
// create and insert new entry
|
||||
final Data data = new Data();
|
||||
for (BOOKMARK b : BOOKMARK.values()) {
|
||||
switch(b) {
|
||||
case DATE_ADDED:
|
||||
case DATE_MODIFIED:
|
||||
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
||||
data.put(b.key(), bmk.get(b.key()));
|
||||
} else {
|
||||
data.put(b.key(), String.valueOf(System.currentTimeMillis()).getBytes());
|
||||
}
|
||||
break;
|
||||
case TAGS:
|
||||
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
||||
this.tags.insertIndexEntry(bmk_user, bmk.get(b.key()), urlHash);
|
||||
data.put(b.key(), bmk.get(b.key()));
|
||||
} else {
|
||||
this.tags.insertIndexEntry(bmk_user, b.deflt(), urlHash);
|
||||
data.put(b.key(), b.deflt());
|
||||
}
|
||||
break;
|
||||
case FOLDERS:
|
||||
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
||||
this.folders.insertIndexEntry(bmk_user, bmk.get(b.key()), urlHash);
|
||||
data.put(b.key(), bmk.get(b.key()));
|
||||
} else {
|
||||
this.folders.insertIndexEntry(bmk_user, b.deflt(), urlHash);
|
||||
data.put(b.key(), b.deflt());
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
||||
data.put(b.key(), bmk.get(b.key()));
|
||||
}
|
||||
}
|
||||
}
|
||||
this.worktables.insert(bmk_table, urlHash, data);
|
||||
} else {
|
||||
// modify and update existing entry
|
||||
HashSet<String> oldSet;
|
||||
HashSet<String> newSet;
|
||||
for (BOOKMARK b : BOOKMARK.values()) {
|
||||
switch(b) {
|
||||
case DATE_ADDED:
|
||||
if(!bmk_row.containsKey(b.key))
|
||||
bmk_row.put(b.key(), date);
|
||||
break;
|
||||
case DATE_MODIFIED:
|
||||
bmk_row.put(b.key(), date);
|
||||
break;
|
||||
case TAGS:
|
||||
oldSet = keysStringToSet(bmk_row.get(b.key(),b.deflt()));
|
||||
if(bmk.containsKey(b.key())) {
|
||||
newSet = keysStringToSet(bmk.get(b.key()));
|
||||
if(importer) {
|
||||
newSet.addAll(oldSet);
|
||||
bmk_row.put(b.key(), keySetToString(newSet));
|
||||
oldSet.clear();
|
||||
} else {
|
||||
bmk_row.put(b.key, bmk.get(b.key()));
|
||||
}
|
||||
} else {
|
||||
newSet = new HashSet<String>();
|
||||
bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt()));
|
||||
}
|
||||
this.tags.updateIndexEntry(bmk_user, urlHash, oldSet, newSet);
|
||||
break;
|
||||
case FOLDERS:
|
||||
oldSet = keysStringToSet(bmk_row.get(b.key(),b.deflt()));
|
||||
if(bmk.containsKey(b.key())) {
|
||||
newSet = keysStringToSet(bmk.get(b.key()));
|
||||
if(importer) {
|
||||
newSet.addAll(oldSet);
|
||||
bmk_row.put(b.key(), keySetToString(newSet));
|
||||
oldSet.clear();
|
||||
} else {
|
||||
bmk_row.put(b.key, bmk.get(b.key()));
|
||||
}
|
||||
} else {
|
||||
newSet = new HashSet<String>();
|
||||
bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt()));
|
||||
}
|
||||
this.folders.updateIndexEntry(bmk_user, urlHash, oldSet, newSet);
|
||||
break;
|
||||
default:
|
||||
if(bmk.containsKey(b.key())) {
|
||||
bmk_row.put(b.key, bmk.get(b.key()));
|
||||
} else {
|
||||
bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt()));
|
||||
}
|
||||
}
|
||||
}
|
||||
// update bmk_table
|
||||
this.worktables.update(bmk_table, bmk_row);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
public static EnumMap<METADATA, String> getMetadata(final byte[] urlHash, final Segment indexSegment) {
|
||||
final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
|
||||
final URIMetadataRow urlEntry = indexSegment.urlMetadata().load(urlHash, null, 0);
|
||||
if (urlEntry != null) {
|
||||
metadata.put(METADATA.IN_URLDB, "true");
|
||||
metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size()));
|
||||
metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate()));
|
||||
metadata.put(METADATA.LOADDATE, ISO8601Formatter.FORMATTER.format(urlEntry.loaddate()));
|
||||
metadata.put(METADATA.MODDATE, ISO8601Formatter.FORMATTER.format(urlEntry.moddate()));
|
||||
metadata.put(METADATA.SNIPPET, String.valueOf(urlEntry.snippet()));
|
||||
metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount()));
|
||||
metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype()));
|
||||
metadata.put(METADATA.LANGUAGE, UTF8.String(urlEntry.language()));
|
||||
|
||||
final URIMetadataRow.Components meta = urlEntry.metadata();
|
||||
if (meta != null) {
|
||||
metadata.put(METADATA.TITLE, meta.dc_title());
|
||||
metadata.put(METADATA.CREATOR, meta.dc_creator());
|
||||
metadata.put(METADATA.KEYWORDS, meta.dc_subject());
|
||||
metadata.put(METADATA.PUBLISHER, meta.dc_publisher());
|
||||
}
|
||||
}
|
||||
return metadata;
|
||||
}
|
||||
|
||||
public static EnumMap<METADATA, String> getMetadata(final Document document) {
|
||||
final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
|
||||
metadata.put(METADATA.IN_URLDB, "false");
|
||||
if(document != null) {
|
||||
metadata.put(METADATA.TITLE, document.dc_title());
|
||||
metadata.put(METADATA.CREATOR, document.dc_creator());
|
||||
metadata.put(METADATA.KEYWORDS, document.dc_subject(' '));
|
||||
metadata.put(METADATA.PUBLISHER, document.dc_publisher());
|
||||
metadata.put(METADATA.DESCRIPTION, document.dc_description());
|
||||
metadata.put(METADATA.MIMETYPE, document.dc_format());
|
||||
metadata.put(METADATA.LANGUAGE, document.dc_language());
|
||||
metadata.put(METADATA.CHARSET, document.getCharset());
|
||||
// metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength()));
|
||||
}
|
||||
return metadata;
|
||||
}
|
||||
|
||||
public String autoTag(final Document document, final String bmk_user, final int count) {
|
||||
final StringBuilder buffer = new StringBuilder();
|
||||
final Map<String, Word> words;
|
||||
if(document != null) {
|
||||
try {
|
||||
words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
|
||||
buffer.append(document.dc_title());
|
||||
buffer.append(document.dc_description());
|
||||
buffer.append(document.dc_subject(' '));
|
||||
final Enumeration<String> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
|
||||
while(tokens.hasMoreElements()) {
|
||||
int max = 1;
|
||||
String token = tokens.nextElement();
|
||||
Word word = words.get(token);
|
||||
if (words.containsKey(token)) {
|
||||
if (this.worktables.has(TABLES.TAGS.tablename(bmk_user), getKeyId(token))) {
|
||||
max = word.occurrences() * 1000;
|
||||
} else if (token.length()>3) {
|
||||
max = word.occurrences() * 100;
|
||||
}
|
||||
for(int i=0; i<max; i++) {
|
||||
word.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
buffer.setLength(0);
|
||||
final ArrayList<String> topwords = new ArrayList<String>(sortWordCounts(words).descendingKeySet());
|
||||
for(int i=0; i<count && i<topwords.size() ; i++) {
|
||||
if(words.get(topwords.get(i)).occurrences() > 100) {
|
||||
buffer.append(topwords.get(i));
|
||||
buffer.append(YMarkTables.TAGS_SEPARATOR);
|
||||
}
|
||||
}
|
||||
} catch (UnsupportedEncodingException e) {
|
||||
Log.logException(e);
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
return YMarkTables.cleanTagsString(buffer.toString());
|
||||
}
|
||||
|
||||
public static TreeMap<String,Word> getWordCounts(final Document document) {
|
||||
if (document != null) {
|
||||
return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words());
|
||||
}
|
||||
return new TreeMap<String, Word>();
|
||||
}
|
||||
|
||||
public static TreeMap<String,Word> sortWordCounts(final Map<String, Word> unsorted_words) {
|
||||
final TreeMap<String, Word> sorted_words = new TreeMap<String, Word>(new YMarkWordCountComparator(unsorted_words));
|
||||
sorted_words.putAll(unsorted_words);
|
||||
return sorted_words;
|
||||
}
|
||||
|
||||
}
|
@ -1,27 +0,0 @@
|
||||
package de.anomic.data;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Map;
|
||||
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
|
||||
public class YMarkWordCountComparator implements Comparator<String> {
|
||||
|
||||
private Map<String,Word> words;
|
||||
|
||||
public YMarkWordCountComparator(final Map<String,Word> words) {
|
||||
this.words = words;
|
||||
}
|
||||
|
||||
public int compare(final String k1, final String k2) {
|
||||
final Word w1 = this.words.get(k1);
|
||||
final Word w2 = this.words.get(k2);
|
||||
|
||||
if(w1.occurrences() > w2.occurrences())
|
||||
return 1;
|
||||
else if(w1.occurrences() < w2.occurrences())
|
||||
return -1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
}
|
@ -0,0 +1,90 @@
|
||||
// YMarkCrawlStart.java
|
||||
// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
|
||||
// first published 2010 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2011-03-09 13:50:39 +0100 (Mi, 09 Mrz 2011) $
|
||||
// $LastChangedRevision: 7574 $
|
||||
// $LastChangedBy: apfelmaennchen $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.data.ymark;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.kelondro.blob.Tables;
|
||||
|
||||
import de.anomic.data.WorkTables;
|
||||
|
||||
public class YMarkCrawlStart extends HashMap<String,String>{
|
||||
|
||||
private static final long serialVersionUID = 1L;
|
||||
private WorkTables worktables;
|
||||
|
||||
public YMarkCrawlStart(final WorkTables worktables) {
|
||||
this.worktables = worktables;
|
||||
}
|
||||
|
||||
public YMarkCrawlStart(final WorkTables worktables, final String url) {
|
||||
this.worktables = worktables;
|
||||
this.clear();
|
||||
this.load(url);
|
||||
}
|
||||
|
||||
public void load(String url) {
|
||||
try {
|
||||
final StringBuffer buffer = new StringBuffer(500);
|
||||
buffer.append("^.*crawlingURL=\\Q");
|
||||
buffer.append(url);
|
||||
buffer.append("\\E?.*");
|
||||
final Pattern pattern = Pattern.compile(buffer.toString());
|
||||
final Iterator<Tables.Row> APIcalls = this.worktables.iterator(WorkTables.TABLE_API_NAME, WorkTables.TABLE_API_COL_URL, pattern);
|
||||
Tables.Row row = null;
|
||||
while(APIcalls.hasNext()) {
|
||||
row = APIcalls.next();
|
||||
if(row.get(WorkTables.TABLE_API_COL_TYPE, "").equals("crawler")) {
|
||||
buffer.setLength(0);
|
||||
buffer.append(row.get(WorkTables.TABLE_API_COL_URL, ""));
|
||||
buffer.delete(0, buffer.indexOf("?")+1);
|
||||
int start = 0;
|
||||
int end = 0;
|
||||
String key;
|
||||
String value;
|
||||
while(start < buffer.length()) {
|
||||
end = buffer.indexOf("=", start);
|
||||
key = buffer.substring(start, end);
|
||||
start = end+1;
|
||||
end = buffer.indexOf("&", start);
|
||||
if(end < 0 || end > buffer.length())
|
||||
end = buffer.length()-1;
|
||||
value = buffer.substring(start, end);
|
||||
start = end+1;
|
||||
this.put(key, value);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
} catch (IOException e) {
|
||||
// TODO Auto-generated catch block
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,92 @@
|
||||
// YMarkDate.java
|
||||
// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
|
||||
// first published 2010 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2011-03-09 13:50:39 +0100 (Mi, 09 Mrz 2011) $
|
||||
// $LastChangedRevision: 7574 $
|
||||
// $LastChangedBy: apfelmaennchen $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.data.ymark;
|
||||
|
||||
import java.text.ParseException;
|
||||
import java.text.SimpleDateFormat;
|
||||
import java.util.Date;
|
||||
|
||||
import net.yacy.cora.date.ISO8601Formatter;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
|
||||
public class YMarkDate {
|
||||
|
||||
private long date;
|
||||
|
||||
public YMarkDate() {
|
||||
this.date = System.currentTimeMillis();
|
||||
}
|
||||
|
||||
public YMarkDate(final byte[] date) {
|
||||
this.set(date);
|
||||
}
|
||||
|
||||
public long parseISO8601(final String s) throws ParseException {
|
||||
if(s == null || s.length() < 1) {
|
||||
throw new ParseException("parseISO8601 - empty string, nothing to parse", 0);
|
||||
}
|
||||
SimpleDateFormat dateformat;
|
||||
StringBuilder date = new StringBuilder(s);
|
||||
if(s.length()==10)
|
||||
dateformat = new SimpleDateFormat("yyyy-MM-dd");
|
||||
else {
|
||||
dateformat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssz");
|
||||
if(date.charAt(date.length()-1) == 'Z') {
|
||||
date.deleteCharAt(date.length()-1);
|
||||
date.append("GMT-00:00");
|
||||
} else {
|
||||
date.insert(date.length()-6, "GMT");
|
||||
}
|
||||
}
|
||||
this.date = dateformat.parse(date.toString()).getTime();
|
||||
return this.date;
|
||||
}
|
||||
|
||||
public String toISO8601() {
|
||||
return ISO8601Formatter.FORMATTER.format(new Date(this.date));
|
||||
}
|
||||
|
||||
public byte[] toBytes() {
|
||||
return String.valueOf(this.date).getBytes();
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return String.valueOf(this.date);
|
||||
}
|
||||
|
||||
public long get() {
|
||||
return this.date;
|
||||
}
|
||||
|
||||
public void set(long date) {
|
||||
this.date = date;
|
||||
}
|
||||
|
||||
public void set(byte[] date) {
|
||||
this.date = Long.parseLong(UTF8.String(date));
|
||||
}
|
||||
}
|
@ -0,0 +1,201 @@
|
||||
// YMarkMetadata.java
|
||||
// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
|
||||
// first published 2010 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2011-03-09 13:50:39 +0100 (Mi, 09 Mrz 2011) $
|
||||
// $LastChangedRevision: 7574 $
|
||||
// $LastChangedBy: apfelmaennchen $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.data.ymark;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.EnumMap;
|
||||
import java.util.Enumeration;
|
||||
import java.util.Map;
|
||||
import java.util.TreeMap;
|
||||
|
||||
import net.yacy.cora.date.ISO8601Formatter;
|
||||
import net.yacy.cora.document.UTF8;
|
||||
import net.yacy.document.Condenser;
|
||||
import net.yacy.document.Document;
|
||||
import net.yacy.document.LibraryProvider;
|
||||
import net.yacy.document.WordTokenizer;
|
||||
import net.yacy.document.Parser.Failure;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.repository.LoaderDispatcher;
|
||||
import de.anomic.crawler.CrawlProfile;
|
||||
import de.anomic.crawler.retrieval.Response;
|
||||
import de.anomic.search.Segments;
|
||||
|
||||
public class YMarkMetadata {
|
||||
private DigestURI uri;
|
||||
Document document;
|
||||
Segments indexSegment;
|
||||
|
||||
public enum METADATA {
|
||||
TITLE,
|
||||
DESCRIPTION,
|
||||
FAVICON,
|
||||
KEYWORDS,
|
||||
LANGUAGE,
|
||||
CREATOR,
|
||||
PUBLISHER,
|
||||
CHARSET,
|
||||
MIMETYPE,
|
||||
SIZE,
|
||||
WORDCOUNT,
|
||||
IN_URLDB,
|
||||
FRESHDATE,
|
||||
LOADDATE,
|
||||
MODDATE,
|
||||
SNIPPET,
|
||||
AUTOTAG
|
||||
}
|
||||
|
||||
public YMarkMetadata(final DigestURI uri) {
|
||||
this.uri = uri;
|
||||
this.document = null;
|
||||
this.indexSegment = null;
|
||||
}
|
||||
|
||||
public YMarkMetadata(final DigestURI uri, final Segments indexSegment) {
|
||||
this.uri = uri;
|
||||
this.document = null;
|
||||
this.indexSegment = indexSegment;
|
||||
}
|
||||
|
||||
public YMarkMetadata(final Document document) {
|
||||
this.document = document;
|
||||
try {
|
||||
this.uri = new DigestURI(this.document.dc_identifier());
|
||||
} catch (MalformedURLException e) {
|
||||
this.uri = null;
|
||||
}
|
||||
this.indexSegment = null;
|
||||
}
|
||||
|
||||
public void loadDocument(LoaderDispatcher loader) throws IOException, Failure {
|
||||
if(document == null) {
|
||||
Response response = null;
|
||||
response = loader.load(loader.request(this.uri, true, false), CrawlProfile.CacheStrategy.IFEXIST, Long.MAX_VALUE, true);
|
||||
this.document = Document.mergeDocuments(response.url(), response.getMimeType(), response.parse());
|
||||
}
|
||||
}
|
||||
|
||||
public EnumMap<METADATA, String> getMetadata() {
|
||||
final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
|
||||
final URIMetadataRow urlEntry = this.indexSegment.segment(Segments.Process.PUBLIC).urlMetadata().load(this.uri.hash(), null, 0);
|
||||
if (urlEntry != null) {
|
||||
metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size()));
|
||||
metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate()));
|
||||
metadata.put(METADATA.LOADDATE, ISO8601Formatter.FORMATTER.format(urlEntry.loaddate()));
|
||||
metadata.put(METADATA.MODDATE, ISO8601Formatter.FORMATTER.format(urlEntry.moddate()));
|
||||
metadata.put(METADATA.SNIPPET, String.valueOf(urlEntry.snippet()));
|
||||
metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount()));
|
||||
metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype()));
|
||||
metadata.put(METADATA.LANGUAGE, UTF8.String(urlEntry.language()));
|
||||
|
||||
final URIMetadataRow.Components meta = urlEntry.metadata();
|
||||
if (meta != null) {
|
||||
metadata.put(METADATA.TITLE, meta.dc_title());
|
||||
metadata.put(METADATA.CREATOR, meta.dc_creator());
|
||||
metadata.put(METADATA.KEYWORDS, meta.dc_subject());
|
||||
metadata.put(METADATA.PUBLISHER, meta.dc_publisher());
|
||||
}
|
||||
}
|
||||
return metadata;
|
||||
}
|
||||
|
||||
public EnumMap<METADATA, String> loadMetadata() {
|
||||
final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
|
||||
if(this.document != null) {
|
||||
metadata.put(METADATA.TITLE, this.document.dc_title());
|
||||
metadata.put(METADATA.CREATOR, this.document.dc_creator());
|
||||
metadata.put(METADATA.KEYWORDS, this.document.dc_subject(' '));
|
||||
metadata.put(METADATA.PUBLISHER, this.document.dc_publisher());
|
||||
metadata.put(METADATA.DESCRIPTION, this.document.dc_description());
|
||||
metadata.put(METADATA.MIMETYPE, this.document.dc_format());
|
||||
metadata.put(METADATA.LANGUAGE, this.document.dc_language());
|
||||
metadata.put(METADATA.CHARSET, this.document.getCharset());
|
||||
// metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength()));
|
||||
metadata.put(METADATA.AUTOTAG, this.autoTag(5));
|
||||
}
|
||||
return metadata;
|
||||
}
|
||||
|
||||
public String autoTag(final int count) {
|
||||
final StringBuilder buffer = new StringBuilder();
|
||||
final Map<String, Word> words;
|
||||
if(this.document != null) {
|
||||
words = new Condenser(this.document, true, true, LibraryProvider.dymLib).words();
|
||||
buffer.append(this.document.dc_title());
|
||||
buffer.append(this.document.dc_description());
|
||||
buffer.append(this.document.dc_subject(' '));
|
||||
final Enumeration<String> tokens = new WordTokenizer(new ByteArrayInputStream(UTF8.getBytes(buffer.toString())), LibraryProvider.dymLib);
|
||||
while(tokens.hasMoreElements()) {
|
||||
int max = 1;
|
||||
String token = tokens.nextElement();
|
||||
Word word = words.get(token);
|
||||
if (words.containsKey(token)) {
|
||||
/*
|
||||
if (this.worktables.has(TABLES.TAGS.tablename(bmk_user), YMarkUtil.getKeyId(token))) {
|
||||
max = word.occurrences() * 1000;
|
||||
} else
|
||||
*/
|
||||
if (token.length()>3) {
|
||||
max = word.occurrences() * 100;
|
||||
}
|
||||
for(int i=0; i<max; i++) {
|
||||
word.inc();
|
||||
}
|
||||
}
|
||||
}
|
||||
buffer.setLength(0);
|
||||
final ArrayList<String> topwords = new ArrayList<String>(sortWordCounts(words).descendingKeySet());
|
||||
for(int i=0; i<count && i<topwords.size() ; i++) {
|
||||
if(words.get(topwords.get(i)).occurrences() > 100) {
|
||||
buffer.append(topwords.get(i));
|
||||
buffer.append(YMarkUtil.TAGS_SEPARATOR);
|
||||
}
|
||||
}
|
||||
}
|
||||
return YMarkUtil.cleanTagsString(buffer.toString());
|
||||
}
|
||||
|
||||
public TreeMap<String,Word> getWordCounts() {
|
||||
if (this.document != null) {
|
||||
return sortWordCounts(new Condenser(this.document, true, true, LibraryProvider.dymLib).words());
|
||||
}
|
||||
return new TreeMap<String, Word>();
|
||||
}
|
||||
|
||||
public static TreeMap<String,Word> sortWordCounts(final Map<String, Word> unsorted_words) {
|
||||
final TreeMap<String, Word> sorted_words = new TreeMap<String, Word>(new YMarkWordCountComparator(unsorted_words));
|
||||
sorted_words.putAll(unsorted_words);
|
||||
return sorted_words;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,340 @@
|
||||
// YMarkTables.java
|
||||
// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
|
||||
// first published 2010 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.data.ymark;
|
||||
|
||||
import java.io.IOException;
|
||||
import java.util.EnumSet;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
import java.util.TreeSet;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import de.anomic.data.WorkTables;
|
||||
|
||||
import net.yacy.kelondro.blob.Tables;
|
||||
import net.yacy.kelondro.blob.Tables.Data;
|
||||
import net.yacy.kelondro.index.RowSpaceExceededException;
|
||||
|
||||
public class YMarkTables {
|
||||
|
||||
public static enum TABLES {
|
||||
BOOKMARKS ("_bookmarks"),
|
||||
TAGS ("_tags"),
|
||||
FOLDERS ("_folders");
|
||||
|
||||
private String basename;
|
||||
|
||||
private TABLES(String b) {
|
||||
this.basename = b;
|
||||
}
|
||||
public String basename() {
|
||||
return this.basename;
|
||||
}
|
||||
public String tablename(String bmk_user) {
|
||||
return bmk_user+this.basename;
|
||||
}
|
||||
}
|
||||
|
||||
public static enum PROTOCOLS {
|
||||
HTTP ("http://"),
|
||||
HTTPS ("https://");
|
||||
|
||||
private String protocol;
|
||||
|
||||
private PROTOCOLS(String s) {
|
||||
this.protocol = s;
|
||||
}
|
||||
public String protocol() {
|
||||
return this.protocol;
|
||||
}
|
||||
public String protocol(String s) {
|
||||
return this.protocol+s;
|
||||
}
|
||||
}
|
||||
|
||||
public static enum BOOKMARK {
|
||||
// key dflt html_attrb xbel_attrb type
|
||||
URL ("url", "", "href", "href", "link"),
|
||||
TITLE ("title", "", "", "", "meta"),
|
||||
DESC ("desc", "", "", "", "comment"),
|
||||
DATE_ADDED ("date_added", "", "add_date", "added", "date"),
|
||||
DATE_MODIFIED ("date_modified", "", "last_modified", "modified", "date"),
|
||||
DATE_VISITED ("date_visited", "", "last_visited", "visited", "date"),
|
||||
PUBLIC ("public", "flase", "", "yacy:public", "lock"),
|
||||
TAGS ("tags", "unsorted", "shortcuturl", "yacy:tags", "tag"),
|
||||
VISITS ("visits", "0", "", "yacy:visits", "stat"),
|
||||
FOLDERS ("folders", "/unsorted", "", "", "folder");
|
||||
|
||||
private String key;
|
||||
private String dflt;
|
||||
private String html_attrb;
|
||||
private String xbel_attrb;
|
||||
private String type;
|
||||
|
||||
private static final Map<String,BOOKMARK> lookup = new HashMap<String,BOOKMARK>();
|
||||
static {
|
||||
for(BOOKMARK b : EnumSet.allOf(BOOKMARK.class))
|
||||
lookup.put(b.key(), b);
|
||||
}
|
||||
|
||||
private static StringBuilder buffer = new StringBuilder(25);;
|
||||
|
||||
private BOOKMARK(String k, String s, String a, String x, String t) {
|
||||
this.key = k;
|
||||
this.dflt = s;
|
||||
this.html_attrb = a;
|
||||
this.xbel_attrb = x;
|
||||
this.type = t;
|
||||
}
|
||||
public static BOOKMARK get(String key) {
|
||||
return lookup.get(key);
|
||||
}
|
||||
public static boolean contains(String key) {
|
||||
return lookup.containsKey(key);
|
||||
}
|
||||
public String key() {
|
||||
return this.key;
|
||||
}
|
||||
public String deflt() {
|
||||
return this.dflt;
|
||||
}
|
||||
public String html_attrb() {
|
||||
return this.html_attrb;
|
||||
}
|
||||
public String xbel_attrb() {
|
||||
return this.xbel_attrb;
|
||||
}
|
||||
public String xbel() {
|
||||
buffer.setLength(0);
|
||||
buffer.append('"');
|
||||
buffer.append('\n');
|
||||
buffer.append(' ');
|
||||
buffer.append(this.xbel_attrb);
|
||||
buffer.append('=');
|
||||
buffer.append('"');
|
||||
return buffer.toString();
|
||||
}
|
||||
public String type() {
|
||||
return this.type;
|
||||
}
|
||||
}
|
||||
|
||||
public final static HashMap<String,String> POISON = new HashMap<String,String>();
|
||||
|
||||
public final static String FOLDERS_ROOT = "/";
|
||||
public final static String FOLDERS_UNSORTED = "/unsorted";
|
||||
public final static String FOLDERS_IMPORTED = "/imported";
|
||||
public static final int FOLDER_BUFFER_SIZE = 100;
|
||||
|
||||
public final static String BOOKMARKS_LOG = "BOOKMARKS";
|
||||
public final static String BOOKMARKS_ID = "id";
|
||||
|
||||
public final static String USER_ADMIN = "admin";
|
||||
public final static String USER_AUTHENTICATE = "AUTHENTICATE";
|
||||
public final static String USER_AUTHENTICATE_MSG = "Authentication required!";
|
||||
|
||||
private WorkTables worktables;
|
||||
|
||||
public YMarkTables(final Tables wt) {
|
||||
this.worktables = (WorkTables)wt;
|
||||
}
|
||||
|
||||
public void deleteBookmark(final String bmk_user, final byte[] urlHash) throws IOException, RowSpaceExceededException {
|
||||
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
|
||||
Tables.Row bmk_row = null;
|
||||
bmk_row = this.worktables.select(bmk_table, urlHash);
|
||||
if(bmk_row != null) {
|
||||
this.worktables.delete(bmk_table,urlHash);
|
||||
}
|
||||
}
|
||||
|
||||
public void deleteBookmark(final String bmk_user, final String url) throws IOException, RowSpaceExceededException {
|
||||
this.deleteBookmark(bmk_user, YMarkUtil.getBookmarkId(url));
|
||||
}
|
||||
|
||||
public TreeSet<String> getFolders(final String bmk_user, final String root) throws IOException {
|
||||
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
|
||||
final Pattern r = Pattern.compile("(?:^|.*,)("+root+"/.*)(?:,|$)");
|
||||
final Iterator<Tables.Row> bit = this.worktables.iterator(bmk_table, YMarkTables.BOOKMARK.FOLDERS.key(), r);
|
||||
final TreeSet<String> folders = new TreeSet<String>();
|
||||
final StringBuilder path = new StringBuilder(200);
|
||||
Tables.Row bmk_row = null;
|
||||
while(bit.hasNext()) {
|
||||
bmk_row = bit.next();
|
||||
if(bmk_row.containsKey(BOOKMARK.FOLDERS.key())) {
|
||||
final String[] folderArray = (new String(bmk_row.get(BOOKMARK.FOLDERS.key()),"UTF8")).split(YMarkUtil.TAGS_SEPARATOR);
|
||||
for (final String folder : folderArray) {
|
||||
if(folder.startsWith(root)) {
|
||||
if(!folders.contains(folder)) {
|
||||
path.setLength(0);
|
||||
path.append(folder);
|
||||
//TODO: get rid of .toString.equals()
|
||||
while(path.length() > 0 && !path.toString().equals(root)){
|
||||
folders.add(path.toString());
|
||||
path.setLength(path.lastIndexOf(YMarkUtil.FOLDERS_SEPARATOR));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (!root.equals(YMarkTables.FOLDERS_ROOT)) { folders.add(root); }
|
||||
return folders;
|
||||
}
|
||||
|
||||
public Iterator<Tables.Row> getBookmarksByFolder(final String bmk_user, final String folder) throws IOException {
|
||||
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
|
||||
final StringBuffer buffer = new StringBuffer(folder.length()+30);
|
||||
buffer.append("(?:^|.*,)(\\Q");
|
||||
buffer.append(folder);
|
||||
buffer.append("\\E)(?:,|$)");
|
||||
final Pattern p = Pattern.compile(buffer.toString());
|
||||
return this.worktables.iterator(bmk_table, YMarkTables.BOOKMARK.FOLDERS.key(), p);
|
||||
}
|
||||
|
||||
public Iterator<Tables.Row> getBookmarksByTag(final String bmk_user, final String[] tagArray) throws IOException {
|
||||
// "(?:^|.*,)((?:tag4|tag2|tag5),*.*){3}"
|
||||
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
|
||||
final StringBuffer buffer = new StringBuffer((tagArray.length * 25)+25);
|
||||
buffer.append("(?:^|.*,)((?:");
|
||||
for (final String tag : tagArray) {
|
||||
buffer.append("\\Q");
|
||||
buffer.append(tag);
|
||||
buffer.append("\\E");
|
||||
buffer.append("|");
|
||||
}
|
||||
buffer.deleteCharAt(buffer.length()-1);
|
||||
buffer.append("),*.*){");
|
||||
buffer.append(tagArray.length);
|
||||
buffer.append("}");
|
||||
final Pattern p = Pattern.compile(buffer.toString());
|
||||
return this.worktables.iterator(bmk_table, YMarkTables.BOOKMARK.TAGS.key(), p);
|
||||
}
|
||||
|
||||
public void addBookmark(final String bmk_user, final HashMap<String,String> bmk, final boolean importer) throws IOException, RowSpaceExceededException {
|
||||
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
|
||||
final String date = String.valueOf(System.currentTimeMillis());
|
||||
final byte[] urlHash = YMarkUtil.getBookmarkId(bmk.get(BOOKMARK.URL.key()));
|
||||
Tables.Row bmk_row = null;
|
||||
|
||||
if (urlHash != null) {
|
||||
bmk_row = this.worktables.select(bmk_table, urlHash);
|
||||
if (bmk_row == null) {
|
||||
// create and insert new entry
|
||||
final Data data = new Data();
|
||||
for (BOOKMARK b : BOOKMARK.values()) {
|
||||
switch(b) {
|
||||
case DATE_ADDED:
|
||||
case DATE_MODIFIED:
|
||||
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
||||
data.put(b.key(), bmk.get(b.key()));
|
||||
} else {
|
||||
data.put(b.key(), String.valueOf(System.currentTimeMillis()).getBytes());
|
||||
}
|
||||
break;
|
||||
case TAGS:
|
||||
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
||||
data.put(b.key(), bmk.get(b.key()));
|
||||
} else {
|
||||
data.put(b.key(), b.deflt());
|
||||
}
|
||||
break;
|
||||
case FOLDERS:
|
||||
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
||||
data.put(b.key(), bmk.get(b.key()));
|
||||
} else {
|
||||
data.put(b.key(), b.deflt());
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
||||
data.put(b.key(), bmk.get(b.key()));
|
||||
}
|
||||
}
|
||||
}
|
||||
this.worktables.insert(bmk_table, urlHash, data);
|
||||
} else {
|
||||
// modify and update existing entry
|
||||
HashSet<String> oldSet;
|
||||
HashSet<String> newSet;
|
||||
for (BOOKMARK b : BOOKMARK.values()) {
|
||||
switch(b) {
|
||||
case DATE_ADDED:
|
||||
if(!bmk_row.containsKey(b.key))
|
||||
bmk_row.put(b.key(), date);
|
||||
break;
|
||||
case DATE_MODIFIED:
|
||||
bmk_row.put(b.key(), date);
|
||||
break;
|
||||
case TAGS:
|
||||
oldSet = YMarkUtil.keysStringToSet(bmk_row.get(b.key(),b.deflt()));
|
||||
if(bmk.containsKey(b.key())) {
|
||||
newSet = YMarkUtil.keysStringToSet(bmk.get(b.key()));
|
||||
if(importer) {
|
||||
newSet.addAll(oldSet);
|
||||
bmk_row.put(b.key(), YMarkUtil.keySetToString(newSet));
|
||||
oldSet.clear();
|
||||
} else {
|
||||
bmk_row.put(b.key, bmk.get(b.key()));
|
||||
}
|
||||
} else {
|
||||
newSet = new HashSet<String>();
|
||||
bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt()));
|
||||
}
|
||||
break;
|
||||
case FOLDERS:
|
||||
oldSet = YMarkUtil.keysStringToSet(bmk_row.get(b.key(),b.deflt()));
|
||||
if(bmk.containsKey(b.key())) {
|
||||
newSet = YMarkUtil.keysStringToSet(bmk.get(b.key()));
|
||||
if(importer) {
|
||||
newSet.addAll(oldSet);
|
||||
bmk_row.put(b.key(), YMarkUtil.keySetToString(newSet));
|
||||
oldSet.clear();
|
||||
} else {
|
||||
bmk_row.put(b.key, bmk.get(b.key()));
|
||||
}
|
||||
} else {
|
||||
newSet = new HashSet<String>();
|
||||
bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt()));
|
||||
}
|
||||
break;
|
||||
default:
|
||||
if(bmk.containsKey(b.key())) {
|
||||
bmk_row.put(b.key, bmk.get(b.key()));
|
||||
} else {
|
||||
bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt()));
|
||||
}
|
||||
}
|
||||
}
|
||||
// update bmk_table
|
||||
this.worktables.update(bmk_table, bmk_row);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,114 @@
|
||||
// YMarkUtil.java
|
||||
// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
|
||||
// first published 2010 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2011-03-09 13:50:39 +0100 (Mi, 09 Mrz 2011) $
|
||||
// $LastChangedRevision: 7574 $
|
||||
// $LastChangedBy: apfelmaennchen $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.data.ymark;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.util.HashSet;
|
||||
import java.util.Iterator;
|
||||
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
|
||||
public class YMarkUtil {
|
||||
public final static String TAGS_SEPARATOR = ",";
|
||||
public final static String FOLDERS_SEPARATOR = "/";
|
||||
|
||||
public final static byte[] getBookmarkId(String url) throws MalformedURLException {
|
||||
return (new DigestURI(url, null)).hash();
|
||||
}
|
||||
|
||||
public final static byte[] getKeyId(final String tag) {
|
||||
return Word.word2hash(tag.toLowerCase());
|
||||
}
|
||||
|
||||
public final static byte[] keySetToBytes(final HashSet<String> urlSet) {
|
||||
return keySetToString(urlSet).getBytes();
|
||||
}
|
||||
|
||||
public final static String keySetToString(final HashSet<String> urlSet) {
|
||||
final Iterator<String> urlIter = urlSet.iterator();
|
||||
final
|
||||
StringBuilder urls = new StringBuilder(urlSet.size()*20);
|
||||
while(urlIter.hasNext()) {
|
||||
urls.append(TAGS_SEPARATOR);
|
||||
urls.append(urlIter.next());
|
||||
}
|
||||
urls.deleteCharAt(0);
|
||||
return urls.toString();
|
||||
}
|
||||
|
||||
public final static HashSet<String> keysStringToSet(final String keysString) {
|
||||
HashSet<String> keySet = new HashSet<String>();
|
||||
final String[] keyArray = keysString.split(TAGS_SEPARATOR);
|
||||
for (final String key : keyArray) {
|
||||
keySet.add(key);
|
||||
}
|
||||
return keySet;
|
||||
}
|
||||
|
||||
public final static String cleanTagsString(final String tagsString) {
|
||||
StringBuilder ts = new StringBuilder(tagsString);
|
||||
if(ts.length() == 0)
|
||||
return YMarkTables.BOOKMARK.TAGS.deflt();
|
||||
// get rid of double commas and space characters following a comma
|
||||
for (int i = 0; i < ts.length()-1; i++) {
|
||||
if (ts.charAt(i) == TAGS_SEPARATOR.charAt(0)) {
|
||||
if (ts.charAt(i+1) == TAGS_SEPARATOR.charAt(0) || ts.charAt(i+1) == ' ') {
|
||||
ts.deleteCharAt(i+1);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
// get rid of heading and trailing comma
|
||||
if (ts.charAt(0) == TAGS_SEPARATOR.charAt(0))
|
||||
ts.deleteCharAt(0);
|
||||
if (ts.charAt(ts.length()-1) == TAGS_SEPARATOR.charAt(0))
|
||||
ts.deleteCharAt(ts.length()-1);
|
||||
return ts.toString();
|
||||
}
|
||||
|
||||
public final static String cleanFoldersString(final String foldersString) {
|
||||
StringBuilder fs = new StringBuilder(cleanTagsString(foldersString));
|
||||
if(fs.length() == 0)
|
||||
return YMarkTables.BOOKMARK.FOLDERS.deflt();
|
||||
for (int i = 0; i < fs.length()-1; i++) {
|
||||
if (fs.charAt(i) == FOLDERS_SEPARATOR.charAt(0)) {
|
||||
if (fs.charAt(i+1) == TAGS_SEPARATOR.charAt(0) || fs.charAt(i+1) == FOLDERS_SEPARATOR.charAt(0)) {
|
||||
fs.deleteCharAt(i);
|
||||
i--;
|
||||
} else if (fs.charAt(i+1) == ' ') {
|
||||
fs.deleteCharAt(i+1);
|
||||
i--;
|
||||
}
|
||||
}
|
||||
}
|
||||
if (fs.charAt(fs.length()-1) == FOLDERS_SEPARATOR.charAt(0)) {
|
||||
fs.deleteCharAt(fs.length()-1);
|
||||
}
|
||||
return fs.toString();
|
||||
}
|
||||
}
|
@ -0,0 +1,53 @@
|
||||
// YMarkWordCountComparator.java
|
||||
// (C) 2011 by Stefan Förster, sof@gmx.de, Norderstedt, Germany
|
||||
// first published 2010 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate$
|
||||
// $LastChangedRevision$
|
||||
// $LastChangedBy$
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.data.ymark;
|
||||
|
||||
import java.util.Comparator;
|
||||
import java.util.Map;
|
||||
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
|
||||
public class YMarkWordCountComparator implements Comparator<String> {
|
||||
|
||||
private Map<String,Word> words;
|
||||
|
||||
public YMarkWordCountComparator(final Map<String,Word> words) {
|
||||
this.words = words;
|
||||
}
|
||||
|
||||
public int compare(final String k1, final String k2) {
|
||||
final Word w1 = this.words.get(k1);
|
||||
final Word w2 = this.words.get(k2);
|
||||
|
||||
if(w1.occurrences() > w2.occurrences())
|
||||
return 1;
|
||||
else if(w1.occurrences() < w2.occurrences())
|
||||
return -1;
|
||||
else
|
||||
return 0;
|
||||
}
|
||||
}
|
Loading…
Reference in new issue