You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
521 lines
19 KiB
521 lines
19 KiB
package de.anomic.data;
|
|
|
|
import java.io.IOException;
|
|
import java.io.ByteArrayInputStream;
|
|
import java.io.UnsupportedEncodingException;
|
|
import java.net.MalformedURLException;
|
|
import java.text.ParseException;
|
|
import java.text.SimpleDateFormat;
|
|
import java.util.ArrayList;
|
|
import java.util.Date;
|
|
import java.util.EnumMap;
|
|
import java.util.EnumSet;
|
|
import java.util.Enumeration;
|
|
import java.util.HashMap;
|
|
import java.util.HashSet;
|
|
import java.util.Iterator;
|
|
import java.util.Map;
|
|
import java.util.TreeMap;
|
|
|
|
import net.yacy.cora.date.ISO8601Formatter;
|
|
import net.yacy.document.Condenser;
|
|
import net.yacy.document.Document;
|
|
import net.yacy.document.LibraryProvider;
|
|
import net.yacy.document.WordTokenizer;
|
|
import net.yacy.kelondro.blob.Tables;
|
|
import net.yacy.kelondro.blob.Tables.Data;
|
|
import net.yacy.kelondro.data.meta.DigestURI;
|
|
import net.yacy.kelondro.data.meta.URIMetadataRow;
|
|
import net.yacy.kelondro.data.word.Word;
|
|
import net.yacy.kelondro.index.RowSpaceExceededException;
|
|
import net.yacy.kelondro.logging.Log;
|
|
import de.anomic.search.Segment;
|
|
|
|
public class YMarkTables {
|
|
|
|
public static enum TABLES {
|
|
BOOKMARKS ("_bookmarks"),
|
|
TAGS ("_tags"),
|
|
FOLDERS ("_folders");
|
|
|
|
private String basename;
|
|
|
|
private TABLES(String b) {
|
|
this.basename = b;
|
|
}
|
|
public String basename() {
|
|
return this.basename;
|
|
}
|
|
public String tablename(String bmk_user) {
|
|
return bmk_user+this.basename;
|
|
}
|
|
}
|
|
|
|
public static enum PROTOCOLS {
|
|
HTTP ("http://"),
|
|
HTTPS ("https://");
|
|
|
|
private String protocol;
|
|
|
|
private PROTOCOLS(String s) {
|
|
this.protocol = s;
|
|
}
|
|
public String protocol() {
|
|
return this.protocol;
|
|
}
|
|
public String protocol(String s) {
|
|
return this.protocol+s;
|
|
}
|
|
}
|
|
|
|
public static enum BOOKMARK {
|
|
// key dflt html_attrb xbel_attrb type
|
|
URL ("url", "", "href", "href", "link"),
|
|
TITLE ("title", "", "", "", "meta"),
|
|
DESC ("desc", "", "", "", "comment"),
|
|
DATE_ADDED ("date_added", "", "add_date", "added", "date"),
|
|
DATE_MODIFIED ("date_modified", "", "last_modified", "modified", "date"),
|
|
DATE_VISITED ("date_visited", "", "last_visited", "visited", "date"),
|
|
PUBLIC ("public", "flase", "", "yacy:public", "lock"),
|
|
TAGS ("tags", "unsorted", "shortcuturl", "yacy:tags", "tag"),
|
|
VISITS ("visits", "0", "", "yacy:visits", "stat"),
|
|
FOLDERS ("folders", "/unsorted", "", "", "folder");
|
|
|
|
private String key;
|
|
private String dflt;
|
|
private String html_attrb;
|
|
private String xbel_attrb;
|
|
private String type;
|
|
|
|
private static final Map<String,BOOKMARK> lookup = new HashMap<String,BOOKMARK>();
|
|
static {
|
|
for(BOOKMARK b : EnumSet.allOf(BOOKMARK.class))
|
|
lookup.put(b.key(), b);
|
|
}
|
|
|
|
private static StringBuilder buffer = new StringBuilder(25);;
|
|
|
|
private BOOKMARK(String k, String s, String a, String x, String t) {
|
|
this.key = k;
|
|
this.dflt = s;
|
|
this.html_attrb = a;
|
|
this.xbel_attrb = x;
|
|
this.type = t;
|
|
}
|
|
public static BOOKMARK get(String key) {
|
|
return lookup.get(key);
|
|
}
|
|
public static boolean contains(String key) {
|
|
return lookup.containsKey(key);
|
|
}
|
|
public String key() {
|
|
return this.key;
|
|
}
|
|
public String deflt() {
|
|
return this.dflt;
|
|
}
|
|
public String html_attrb() {
|
|
return this.html_attrb;
|
|
}
|
|
public String xbel_attrb() {
|
|
return this.xbel_attrb;
|
|
}
|
|
public String xbel() {
|
|
buffer.setLength(0);
|
|
buffer.append('"');
|
|
buffer.append('\n');
|
|
buffer.append(' ');
|
|
buffer.append(this.xbel_attrb);
|
|
buffer.append('=');
|
|
buffer.append('"');
|
|
return buffer.toString();
|
|
}
|
|
public String type() {
|
|
return this.type;
|
|
}
|
|
}
|
|
|
|
public enum METADATA {
|
|
TITLE,
|
|
DESCRIPTION,
|
|
FAVICON,
|
|
KEYWORDS,
|
|
LANGUAGE,
|
|
CREATOR,
|
|
PUBLISHER,
|
|
CHARSET,
|
|
MIMETYPE,
|
|
SIZE,
|
|
WORDCOUNT,
|
|
IN_URLDB,
|
|
FRESHDATE,
|
|
LOADDATE,
|
|
MODDATE,
|
|
SNIPPET
|
|
}
|
|
|
|
public final static HashMap<String,String> POISON = new HashMap<String,String>();
|
|
|
|
public final static String TAGS_SEPARATOR = ",";
|
|
|
|
public final static String FOLDERS_SEPARATOR = "/";
|
|
public final static String FOLDERS_ROOT = "/";
|
|
public final static String FOLDERS_UNSORTED = "/unsorted";
|
|
public final static String FOLDERS_IMPORTED = "/imported";
|
|
public static final int FOLDER_BUFFER_SIZE = 100;
|
|
|
|
public final static String BOOKMARKS_LOG = "BOOKMARKS";
|
|
public final static String BOOKMARKS_ID = "id";
|
|
|
|
public final static String USER_ADMIN = "admin";
|
|
public final static String USER_AUTHENTICATE = "AUTHENTICATE";
|
|
public final static String USER_AUTHENTICATE_MSG = "Authentication required!";
|
|
|
|
private WorkTables worktables;
|
|
public YMarkIndex tags;
|
|
public YMarkIndex folders;
|
|
|
|
public YMarkTables(final Tables wt) {
|
|
this.worktables = (WorkTables)wt;
|
|
this.folders = new YMarkIndex(this.worktables, TABLES.FOLDERS.basename());
|
|
this.tags = new YMarkIndex(this.worktables, TABLES.TAGS.basename());
|
|
}
|
|
|
|
public static Date parseISO8601(final String s) throws ParseException {
|
|
if(s == null || s.length() < 1) {
|
|
throw new ParseException("parseISO8601 - empty string, nothing to parse", 0);
|
|
}
|
|
SimpleDateFormat dateformat;
|
|
StringBuilder date = new StringBuilder(s);
|
|
if(s.length()==10)
|
|
dateformat = new SimpleDateFormat("yyyy-MM-dd");
|
|
else {
|
|
dateformat = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ssz");
|
|
if(date.charAt(date.length()-1) == 'Z') {
|
|
date.deleteCharAt(date.length()-1);
|
|
date.append("GMT-00:00");
|
|
} else {
|
|
date.insert(date.length()-6, "GMT");
|
|
}
|
|
}
|
|
return dateformat.parse(date.toString());
|
|
}
|
|
|
|
public static String getISO8601(final byte[] date) {
|
|
if(date != null) {
|
|
final String s = new String(date);
|
|
if(s != null && s.length() > 0)
|
|
return ISO8601Formatter.FORMATTER.format(new Date(Long.parseLong(s)));
|
|
}
|
|
return "";
|
|
}
|
|
|
|
public final static byte[] getBookmarkId(String url) throws MalformedURLException {
|
|
return (new DigestURI(url, null)).hash();
|
|
}
|
|
|
|
public final static byte[] getKeyId(final String tag) {
|
|
return Word.word2hash(tag.toLowerCase());
|
|
}
|
|
|
|
public final static byte[] keySetToBytes(final HashSet<String> urlSet) {
|
|
return keySetToString(urlSet).getBytes();
|
|
}
|
|
|
|
public final static String keySetToString(final HashSet<String> urlSet) {
|
|
final Iterator<String> urlIter = urlSet.iterator();
|
|
final
|
|
StringBuilder urls = new StringBuilder(urlSet.size()*20);
|
|
while(urlIter.hasNext()) {
|
|
urls.append(TAGS_SEPARATOR);
|
|
urls.append(urlIter.next());
|
|
}
|
|
urls.deleteCharAt(0);
|
|
return urls.toString();
|
|
}
|
|
|
|
public final static HashSet<String> keysStringToSet(final String keysString) {
|
|
HashSet<String> keySet = new HashSet<String>();
|
|
final String[] keyArray = keysString.split(TAGS_SEPARATOR);
|
|
for (final String key : keyArray) {
|
|
keySet.add(key);
|
|
}
|
|
return keySet;
|
|
}
|
|
|
|
public final static String cleanTagsString(final String tagsString) {
|
|
StringBuilder ts = new StringBuilder(tagsString);
|
|
if(ts.length() == 0)
|
|
return YMarkTables.BOOKMARK.TAGS.deflt();
|
|
// get rid of double commas and space characters following a comma
|
|
for (int i = 0; i < ts.length()-1; i++) {
|
|
if (ts.charAt(i) == TAGS_SEPARATOR.charAt(0)) {
|
|
if (ts.charAt(i+1) == TAGS_SEPARATOR.charAt(0) || ts.charAt(i+1) == ' ') {
|
|
ts.deleteCharAt(i+1);
|
|
i--;
|
|
}
|
|
}
|
|
}
|
|
// get rid of heading and trailing comma
|
|
if (ts.charAt(0) == TAGS_SEPARATOR.charAt(0))
|
|
ts.deleteCharAt(0);
|
|
if (ts.charAt(ts.length()-1) == TAGS_SEPARATOR.charAt(0))
|
|
ts.deleteCharAt(ts.length()-1);
|
|
return ts.toString();
|
|
}
|
|
|
|
public final static String cleanFoldersString(final String foldersString) {
|
|
StringBuilder fs = new StringBuilder(cleanTagsString(foldersString));
|
|
if(fs.length() == 0)
|
|
return YMarkTables.BOOKMARK.FOLDERS.deflt();
|
|
for (int i = 0; i < fs.length()-1; i++) {
|
|
if (fs.charAt(i) == FOLDERS_SEPARATOR.charAt(0)) {
|
|
if (fs.charAt(i+1) == TAGS_SEPARATOR.charAt(0) || fs.charAt(i+1) == FOLDERS_SEPARATOR.charAt(0)) {
|
|
fs.deleteCharAt(i);
|
|
i--;
|
|
} else if (fs.charAt(i+1) == ' ') {
|
|
fs.deleteCharAt(i+1);
|
|
i--;
|
|
}
|
|
}
|
|
}
|
|
if (fs.charAt(fs.length()-1) == FOLDERS_SEPARATOR.charAt(0)) {
|
|
fs.deleteCharAt(fs.length()-1);
|
|
}
|
|
return fs.toString();
|
|
}
|
|
|
|
public void clearIndex(String tablename) {
|
|
if (tablename.endsWith(TABLES.TAGS.basename()))
|
|
this.tags.clearCache();
|
|
if (tablename.endsWith(TABLES.FOLDERS.basename()))
|
|
this.folders.clearCache();
|
|
}
|
|
|
|
public void deleteBookmark(final String bmk_user, final byte[] urlHash) throws IOException, RowSpaceExceededException {
|
|
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
|
|
Tables.Row bmk_row = null;
|
|
bmk_row = this.worktables.select(bmk_table, urlHash);
|
|
if(bmk_row != null) {
|
|
final String tagsString = bmk_row.get(YMarkTables.BOOKMARK.TAGS.key(),YMarkTables.BOOKMARK.TAGS.deflt());
|
|
tags.removeIndexEntry(bmk_user, tagsString, urlHash);
|
|
final String foldersString = bmk_row.get(YMarkTables.BOOKMARK.FOLDERS.key(),YMarkTables.FOLDERS_ROOT);
|
|
folders.removeIndexEntry(bmk_user, foldersString, urlHash);
|
|
this.worktables.delete(bmk_table,urlHash);
|
|
}
|
|
}
|
|
|
|
public void deleteBookmark(final String bmk_user, final String url) throws IOException, RowSpaceExceededException {
|
|
this.deleteBookmark(bmk_user, getBookmarkId(url));
|
|
}
|
|
|
|
public void addBookmark(final String bmk_user, final HashMap<String,String> bmk, final boolean importer) throws IOException, RowSpaceExceededException {
|
|
final String bmk_table = TABLES.BOOKMARKS.tablename(bmk_user);
|
|
final String date = String.valueOf(System.currentTimeMillis());
|
|
final byte[] urlHash = getBookmarkId(bmk.get(BOOKMARK.URL.key()));
|
|
Tables.Row bmk_row = null;
|
|
|
|
if (urlHash != null) {
|
|
bmk_row = this.worktables.select(bmk_table, urlHash);
|
|
if (bmk_row == null) {
|
|
// create and insert new entry
|
|
final Data data = new Data();
|
|
for (BOOKMARK b : BOOKMARK.values()) {
|
|
switch(b) {
|
|
case DATE_ADDED:
|
|
case DATE_MODIFIED:
|
|
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
|
data.put(b.key(), bmk.get(b.key()));
|
|
} else {
|
|
data.put(b.key(), String.valueOf(System.currentTimeMillis()).getBytes());
|
|
}
|
|
break;
|
|
case TAGS:
|
|
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
|
this.tags.insertIndexEntry(bmk_user, bmk.get(b.key()), urlHash);
|
|
data.put(b.key(), bmk.get(b.key()));
|
|
} else {
|
|
this.tags.insertIndexEntry(bmk_user, b.deflt(), urlHash);
|
|
data.put(b.key(), b.deflt());
|
|
}
|
|
break;
|
|
case FOLDERS:
|
|
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
|
this.folders.insertIndexEntry(bmk_user, bmk.get(b.key()), urlHash);
|
|
data.put(b.key(), bmk.get(b.key()));
|
|
} else {
|
|
this.folders.insertIndexEntry(bmk_user, b.deflt(), urlHash);
|
|
data.put(b.key(), b.deflt());
|
|
}
|
|
break;
|
|
default:
|
|
if(bmk.containsKey(b.key()) && bmk.get(b.key()) != null) {
|
|
data.put(b.key(), bmk.get(b.key()));
|
|
}
|
|
}
|
|
}
|
|
this.worktables.insert(bmk_table, urlHash, data);
|
|
} else {
|
|
// modify and update existing entry
|
|
HashSet<String> oldSet;
|
|
HashSet<String> newSet;
|
|
for (BOOKMARK b : BOOKMARK.values()) {
|
|
switch(b) {
|
|
case DATE_ADDED:
|
|
if(!bmk_row.containsKey(b.key))
|
|
bmk_row.put(b.key(), date);
|
|
break;
|
|
case DATE_MODIFIED:
|
|
bmk_row.put(b.key(), date);
|
|
break;
|
|
case TAGS:
|
|
oldSet = keysStringToSet(bmk_row.get(b.key(),b.deflt()));
|
|
if(bmk.containsKey(b.key())) {
|
|
newSet = keysStringToSet(bmk.get(b.key()));
|
|
if(importer) {
|
|
newSet.addAll(oldSet);
|
|
bmk_row.put(b.key(), keySetToString(newSet));
|
|
oldSet.clear();
|
|
} else {
|
|
bmk_row.put(b.key, bmk.get(b.key()));
|
|
}
|
|
} else {
|
|
newSet = new HashSet<String>();
|
|
bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt()));
|
|
}
|
|
this.tags.updateIndexEntry(bmk_user, urlHash, oldSet, newSet);
|
|
break;
|
|
case FOLDERS:
|
|
oldSet = keysStringToSet(bmk_row.get(b.key(),b.deflt()));
|
|
if(bmk.containsKey(b.key())) {
|
|
newSet = keysStringToSet(bmk.get(b.key()));
|
|
if(importer) {
|
|
newSet.addAll(oldSet);
|
|
bmk_row.put(b.key(), keySetToString(newSet));
|
|
oldSet.clear();
|
|
} else {
|
|
bmk_row.put(b.key, bmk.get(b.key()));
|
|
}
|
|
} else {
|
|
newSet = new HashSet<String>();
|
|
bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt()));
|
|
}
|
|
this.folders.updateIndexEntry(bmk_user, urlHash, oldSet, newSet);
|
|
break;
|
|
default:
|
|
if(bmk.containsKey(b.key())) {
|
|
bmk_row.put(b.key, bmk.get(b.key()));
|
|
} else {
|
|
bmk_row.put(b.key, bmk_row.get(b.key(), b.deflt()));
|
|
}
|
|
}
|
|
}
|
|
// update bmk_table
|
|
this.worktables.update(bmk_table, bmk_row);
|
|
}
|
|
}
|
|
}
|
|
|
|
public static EnumMap<METADATA, String> getMetadata(final byte[] urlHash, final Segment indexSegment) {
|
|
final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
|
|
final URIMetadataRow urlEntry = indexSegment.urlMetadata().load(urlHash, null, 0);
|
|
if (urlEntry != null) {
|
|
metadata.put(METADATA.IN_URLDB, "true");
|
|
metadata.put(METADATA.SIZE, String.valueOf(urlEntry.size()));
|
|
metadata.put(METADATA.FRESHDATE, ISO8601Formatter.FORMATTER.format(urlEntry.freshdate()));
|
|
metadata.put(METADATA.LOADDATE, ISO8601Formatter.FORMATTER.format(urlEntry.loaddate()));
|
|
metadata.put(METADATA.MODDATE, ISO8601Formatter.FORMATTER.format(urlEntry.moddate()));
|
|
metadata.put(METADATA.SNIPPET, String.valueOf(urlEntry.snippet()));
|
|
metadata.put(METADATA.WORDCOUNT, String.valueOf(urlEntry.wordCount()));
|
|
metadata.put(METADATA.MIMETYPE, String.valueOf(urlEntry.doctype()));
|
|
metadata.put(METADATA.LANGUAGE, urlEntry.language());
|
|
|
|
final URIMetadataRow.Components meta = urlEntry.metadata();
|
|
if (meta != null) {
|
|
metadata.put(METADATA.TITLE, meta.dc_title());
|
|
metadata.put(METADATA.CREATOR, meta.dc_creator());
|
|
metadata.put(METADATA.KEYWORDS, meta.dc_subject());
|
|
metadata.put(METADATA.PUBLISHER, meta.dc_publisher());
|
|
}
|
|
}
|
|
return metadata;
|
|
}
|
|
|
|
public static EnumMap<METADATA, String> getMetadata(final Document document) {
|
|
final EnumMap<METADATA, String> metadata = new EnumMap<METADATA, String>(METADATA.class);
|
|
metadata.put(METADATA.IN_URLDB, "false");
|
|
if(document != null) {
|
|
metadata.put(METADATA.TITLE, document.dc_title());
|
|
metadata.put(METADATA.CREATOR, document.dc_creator());
|
|
metadata.put(METADATA.KEYWORDS, document.dc_subject(' '));
|
|
metadata.put(METADATA.PUBLISHER, document.dc_publisher());
|
|
metadata.put(METADATA.DESCRIPTION, document.dc_description());
|
|
metadata.put(METADATA.MIMETYPE, document.dc_format());
|
|
metadata.put(METADATA.LANGUAGE, document.dc_language());
|
|
metadata.put(METADATA.CHARSET, document.getCharset());
|
|
// metadata.put(METADATA.SIZE, String.valueOf(document.getTextLength()));
|
|
}
|
|
return metadata;
|
|
}
|
|
|
|
public String autoTag(final Document document, final String bmk_user, final int count) {
|
|
final StringBuilder buffer = new StringBuilder();
|
|
final Map<String, Word> words;
|
|
if(document != null) {
|
|
try {
|
|
words = new Condenser(document, true, true, LibraryProvider.dymLib).words();
|
|
buffer.append(document.dc_title());
|
|
buffer.append(document.dc_description());
|
|
buffer.append(document.dc_subject(' '));
|
|
final Enumeration<String> tokens = new WordTokenizer(new ByteArrayInputStream(buffer.toString().getBytes("UTF-8")), LibraryProvider.dymLib);
|
|
while(tokens.hasMoreElements()) {
|
|
int max = 1;
|
|
String token = tokens.nextElement();
|
|
Word word = words.get(token);
|
|
if (words.containsKey(token)) {
|
|
if (this.worktables.has(TABLES.TAGS.tablename(bmk_user), getKeyId(token))) {
|
|
max = word.occurrences() * 1000;
|
|
} else if (token.length()>3) {
|
|
max = word.occurrences() * 100;
|
|
}
|
|
for(int i=0; i<max; i++) {
|
|
word.inc();
|
|
}
|
|
}
|
|
}
|
|
buffer.setLength(0);
|
|
final ArrayList<String> topwords = new ArrayList<String>(sortWordCounts(words).descendingKeySet());
|
|
for(int i=0; i<count && i<topwords.size() ; i++) {
|
|
if(words.get(topwords.get(i)).occurrences() > 100) {
|
|
buffer.append(topwords.get(i));
|
|
buffer.append(YMarkTables.TAGS_SEPARATOR);
|
|
}
|
|
}
|
|
} catch (UnsupportedEncodingException e) {
|
|
Log.logException(e);
|
|
} catch (IOException e) {
|
|
Log.logException(e);
|
|
}
|
|
}
|
|
return YMarkTables.cleanTagsString(buffer.toString());
|
|
}
|
|
|
|
public static TreeMap<String,Word> getWordCounts(final Document document) {
|
|
try {
|
|
if(document != null) {
|
|
return sortWordCounts(new Condenser(document, true, true, LibraryProvider.dymLib).words());
|
|
}
|
|
} catch (IOException e) {
|
|
Log.logException(e);
|
|
}
|
|
return new TreeMap<String, Word>();
|
|
}
|
|
|
|
public static TreeMap<String,Word> sortWordCounts(final Map<String, Word> unsorted_words) {
|
|
final TreeMap<String, Word> sorted_words = new TreeMap<String, Word>(new YMarkWordCountComparator(unsorted_words));
|
|
sorted_words.putAll(unsorted_words);
|
|
return sorted_words;
|
|
}
|
|
|
|
}
|