refactoring of bookmarks: there is a big performance problem in the bookmarks code and furthermore the bookmarks
will lose their leading role for the re-crawl function once the new api tables work. To be prepared for a replacement of such functions the bookmark class is re-organised. git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6637 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
3751ab4ae2
commit
ada0ce9de3
@ -0,0 +1,165 @@
|
||||
// BookmarkDate.java
|
||||
// -------------------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@yacy.net
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
//
|
||||
// Methods from this file were originally contributed by Alexander Schier
|
||||
// and were refactored by Michael Christen for a better method structure on 30.01.2010
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
|
||||
package de.anomic.data;
|
||||
|
||||
import java.io.File;
|
||||
import java.io.IOException;
|
||||
import java.util.ArrayList;
|
||||
import java.util.HashMap;
|
||||
import java.util.Iterator;
|
||||
import java.util.Map;
|
||||
|
||||
import de.anomic.data.bookmarksDB.Bookmark;
|
||||
|
||||
import net.yacy.kelondro.blob.MapHeap;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.order.NaturalOrder;
|
||||
|
||||
public class BookmarkDate {
|
||||
|
||||
MapHeap datesTable;
|
||||
|
||||
public BookmarkDate(File datesFile) throws IOException {
|
||||
this.datesTable = new MapHeap(datesFile, 20, NaturalOrder.naturalOrder, 1024 * 64, 500, '_');
|
||||
}
|
||||
|
||||
public void close() {
|
||||
this.datesTable.close();
|
||||
}
|
||||
|
||||
|
||||
public Entry getDate(final String date) {
|
||||
Map<String, String> map;
|
||||
try {
|
||||
map = datesTable.get(date);
|
||||
} catch (final IOException e) {
|
||||
map = null;
|
||||
}
|
||||
if (map==null) return new Entry(date);
|
||||
return new Entry(date, map);
|
||||
}
|
||||
|
||||
// rebuilds the datesDB from the bookmarksDB
|
||||
public void init(Iterator<Bookmark> it) {
|
||||
Log.logInfo("BOOKMARKS", "start init dates.db from bookmarks.db...");
|
||||
//final Iterator<Bookmark> it=bookmarkIterator(true);
|
||||
Bookmark bookmark;
|
||||
String date;
|
||||
Entry bmDate;
|
||||
int count = 0;
|
||||
while(it.hasNext()){
|
||||
bookmark=it.next();
|
||||
date = String.valueOf(bookmark.getTimeStamp());
|
||||
bmDate=getDate(date);
|
||||
if(bmDate==null){
|
||||
bmDate=new Entry(date);
|
||||
}
|
||||
bmDate.add(bookmark.getUrlHash());
|
||||
bmDate.setDatesTable();
|
||||
count++;
|
||||
}
|
||||
Log.logInfo("BOOKMARKS", "finished init "+datesTable.size()+" dates using " + count + " bookmarks.");
|
||||
}
|
||||
|
||||
/**
|
||||
* Subclass of bookmarksDB, which provide the bookmarksDate object-type
|
||||
*/
|
||||
public class Entry {
|
||||
public static final String URL_HASHES="urlHashes";
|
||||
private final Map<String, String> mem;
|
||||
String date;
|
||||
|
||||
public Entry(final String mydate){
|
||||
//round to seconds, but store as milliseconds (java timestamp)
|
||||
date=String.valueOf((Long.parseLong(mydate)/1000)*1000);
|
||||
mem=new HashMap<String, String>();
|
||||
mem.put(URL_HASHES, "");
|
||||
}
|
||||
|
||||
public Entry(final String mydate, final Map<String, String> map){
|
||||
//round to seconds, but store as milliseconds (java timestamp)
|
||||
date=String.valueOf((Long.parseLong(mydate)/1000)*1000);
|
||||
mem=map;
|
||||
}
|
||||
public Entry(final String mydate, final ArrayList<String> entries){
|
||||
//round to seconds, but store as milliseconds (java timestamp)
|
||||
date=String.valueOf((Long.parseLong(mydate)/1000)*1000);
|
||||
mem=new HashMap<String, String>();
|
||||
mem.put(URL_HASHES, listManager.collection2string(entries));
|
||||
}
|
||||
public void add(final String urlHash){
|
||||
final String urlHashes = mem.get(URL_HASHES);
|
||||
ArrayList<String> list;
|
||||
if(urlHashes != null && !urlHashes.equals("")){
|
||||
list=listManager.string2arraylist(urlHashes);
|
||||
}else{
|
||||
list=new ArrayList<String>();
|
||||
}
|
||||
if(!list.contains(urlHash) && urlHash != null && !urlHash.equals("")){
|
||||
list.add(urlHash);
|
||||
}
|
||||
this.mem.put(URL_HASHES, listManager.collection2string(list));
|
||||
/*if(urlHashes!=null && !urlHashes.equals("") ){
|
||||
if(urlHashes.indexOf(urlHash) <0){
|
||||
this.mem.put(URL_HASHES, urlHashes+","+urlHash);
|
||||
}
|
||||
}else{
|
||||
this.mem.put(URL_HASHES, urlHash);
|
||||
}*/
|
||||
}
|
||||
public void delete(final String urlHash){
|
||||
final ArrayList<String> list=listManager.string2arraylist(this.mem.get(URL_HASHES));
|
||||
if(list.contains(urlHash)){
|
||||
list.remove(urlHash);
|
||||
}
|
||||
this.mem.put(URL_HASHES, listManager.collection2string(list));
|
||||
}
|
||||
public void setDatesTable() {
|
||||
if (this.size() >0) {
|
||||
try {
|
||||
datesTable.put(getDateString(), mem);
|
||||
} catch (Exception e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
} else {
|
||||
try {
|
||||
datesTable.remove(getDateString());
|
||||
} catch (IOException e) {
|
||||
Log.logException(e);
|
||||
}
|
||||
}
|
||||
}
|
||||
public String getDateString(){
|
||||
return date;
|
||||
}
|
||||
public ArrayList<String> getBookmarkList(){
|
||||
return listManager.string2arraylist(this.mem.get(URL_HASHES));
|
||||
}
|
||||
public int size(){
|
||||
return listManager.string2arraylist(this.mem.get(URL_HASHES)).size();
|
||||
}
|
||||
}
|
||||
}
|
@ -0,0 +1,255 @@
|
||||
// BookmarkHelper.java
|
||||
// -------------------------------------
|
||||
// part of YACY
|
||||
// (C) by Michael Peter Christen; mc@yacy.net
|
||||
// first published on http://www.anomic.de
|
||||
// Frankfurt, Germany, 2004
|
||||
//
|
||||
// Methods from this file were originally contributed by Alexander Schier
|
||||
// and were refactored by Michael Christen for a better method structure on 30.01.2010
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package de.anomic.data;
|
||||
|
||||
import java.io.ByteArrayInputStream;
|
||||
import java.io.IOException;
|
||||
import java.io.InputStream;
|
||||
import java.io.InputStreamReader;
|
||||
import java.io.UnsupportedEncodingException;
|
||||
import java.io.Writer;
|
||||
import java.text.ParseException;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.HashSet;
|
||||
import java.util.Map;
|
||||
import java.util.Set;
|
||||
import java.util.Map.Entry;
|
||||
|
||||
import javax.xml.parsers.DocumentBuilder;
|
||||
import javax.xml.parsers.DocumentBuilderFactory;
|
||||
import javax.xml.parsers.ParserConfigurationException;
|
||||
|
||||
import org.w3c.dom.Document;
|
||||
import org.w3c.dom.NamedNodeMap;
|
||||
import org.w3c.dom.Node;
|
||||
import org.w3c.dom.NodeList;
|
||||
import org.xml.sax.SAXException;
|
||||
|
||||
import de.anomic.data.bookmarksDB.Bookmark;
|
||||
import net.yacy.document.parser.html.ContentScraper;
|
||||
import net.yacy.document.parser.html.TransformerWriter;
|
||||
import net.yacy.kelondro.data.meta.DigestURI;
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.logging.Log;
|
||||
import net.yacy.kelondro.util.DateFormatter;
|
||||
import net.yacy.kelondro.util.FileUtils;
|
||||
|
||||
public class BookmarkHelper {
|
||||
|
||||
public static String cleanTagsString(String tagsString) {
|
||||
|
||||
// get rid of heading, trailing and double commas since they are useless
|
||||
while (tagsString.length() > 0 && tagsString.charAt(0) == ',') {
|
||||
tagsString = tagsString.substring(1);
|
||||
}
|
||||
while (tagsString.endsWith(",")) {
|
||||
tagsString = tagsString.substring(0,tagsString.length() -1);
|
||||
}
|
||||
while (tagsString.contains(",,")){
|
||||
tagsString = tagsString.replaceAll(",,", ",");
|
||||
}
|
||||
// get rid of double and trailing slashes
|
||||
while (tagsString.endsWith("/")){
|
||||
tagsString = tagsString.substring(0, tagsString.length() -1);
|
||||
}
|
||||
while (tagsString.contains("/,")){
|
||||
tagsString = tagsString.replaceAll("/,", ",");
|
||||
}
|
||||
while (tagsString.contains("//")){
|
||||
tagsString = tagsString.replaceAll("//", "/");
|
||||
}
|
||||
// space characters following a comma are removed
|
||||
tagsString = tagsString.replaceAll(",\\s+", ",");
|
||||
|
||||
return tagsString;
|
||||
}
|
||||
|
||||
|
||||
/**
|
||||
* returns an object of type String that contains a tagHash
|
||||
* @param tagName an object of type String with the name of the tag.
|
||||
* tagName is converted to lower case before hash is generated!
|
||||
*/
|
||||
public static String tagHash(final String tagName){
|
||||
return new String(Word.word2hash(tagName.toLowerCase()));
|
||||
}
|
||||
/*
|
||||
private static String tagHash(final String tagName, final String user){
|
||||
return new String(Word.word2hash(user+":"+tagName.toLowerCase()));
|
||||
}
|
||||
*/
|
||||
|
||||
|
||||
|
||||
// --------------------------------------
|
||||
// bookmarksDB's Import/Export functions
|
||||
// --------------------------------------
|
||||
|
||||
public static int importFromBookmarks(bookmarksDB db, final DigestURI baseURL, final String input, final String tag, final boolean importPublic){
|
||||
try {
|
||||
// convert string to input stream
|
||||
final ByteArrayInputStream byteIn = new ByteArrayInputStream(input.getBytes("UTF-8"));
|
||||
final InputStreamReader reader = new InputStreamReader(byteIn,"UTF-8");
|
||||
|
||||
// import stream
|
||||
return importFromBookmarks(db, baseURL, reader, tag, importPublic);
|
||||
} catch (final UnsupportedEncodingException e) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private static int importFromBookmarks(bookmarksDB db, final DigestURI baseURL, final InputStreamReader input, final String tag, final boolean importPublic){
|
||||
|
||||
int importCount = 0;
|
||||
|
||||
Map<DigestURI, String> links = new HashMap<DigestURI, String>();
|
||||
String title;
|
||||
DigestURI url;
|
||||
Bookmark bm;
|
||||
final Set<String> tags=listManager.string2set(tag); //this allow multiple default tags
|
||||
try {
|
||||
//load the links
|
||||
final ContentScraper scraper = new ContentScraper(baseURL);
|
||||
//OutputStream os = new htmlFilterOutputStream(null, scraper, null, false);
|
||||
final Writer writer= new TransformerWriter(null,null,scraper, null, false);
|
||||
FileUtils.copy(input,writer);
|
||||
writer.close();
|
||||
links = scraper.getAnchors();
|
||||
} catch (final IOException e) { Log.logWarning("BOOKMARKS", "error during load of links: "+ e.getClass() +" "+ e.getMessage());}
|
||||
for (Entry<DigestURI, String> link: links.entrySet()) {
|
||||
url= link.getKey();
|
||||
title=link.getValue();
|
||||
Log.logInfo("BOOKMARKS", "links.get(url)");
|
||||
if(title.equals("")){//cannot be displayed
|
||||
title=url.toString();
|
||||
}
|
||||
bm=db.new Bookmark(url.toString());
|
||||
bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
|
||||
bm.setTags(tags);
|
||||
bm.setPublic(importPublic);
|
||||
db.saveBookmark(bm);
|
||||
|
||||
importCount++;
|
||||
}
|
||||
|
||||
db.flushTagCache();
|
||||
|
||||
return importCount;
|
||||
}
|
||||
|
||||
|
||||
public static int importFromXML(bookmarksDB db, final String input, final boolean importPublic){
|
||||
try {
|
||||
// convert string to input stream
|
||||
final ByteArrayInputStream byteIn = new ByteArrayInputStream(input.getBytes("UTF-8"));
|
||||
|
||||
// import stream
|
||||
return importFromXML(db, byteIn,importPublic);
|
||||
} catch (final UnsupportedEncodingException e) {
|
||||
return 0;
|
||||
}
|
||||
}
|
||||
|
||||
private static int importFromXML(bookmarksDB db, final InputStream input, final boolean importPublic){
|
||||
final DocumentBuilderFactory factory=DocumentBuilderFactory.newInstance();
|
||||
factory.setValidating(false);
|
||||
factory.setNamespaceAware(false);
|
||||
DocumentBuilder builder;
|
||||
try {
|
||||
builder = factory.newDocumentBuilder();
|
||||
final Document doc=builder.parse(input);
|
||||
return parseXMLimport(db, doc, importPublic);
|
||||
} catch (final ParserConfigurationException e) {
|
||||
} catch (final SAXException e) {
|
||||
} catch (final IOException e) {
|
||||
}
|
||||
return 0;
|
||||
|
||||
}
|
||||
|
||||
private static int parseXMLimport(bookmarksDB db, final Node doc, final boolean importPublic){
|
||||
int importCount = 0;
|
||||
if (doc.getNodeName().equals("post")) {
|
||||
final NamedNodeMap attributes = doc.getAttributes();
|
||||
final String url=attributes.getNamedItem("href").getNodeValue();
|
||||
if(url.equals("")){
|
||||
return 0;
|
||||
}
|
||||
final Bookmark bm=db.new Bookmark(url);
|
||||
String tagsString="";
|
||||
String title="";
|
||||
String description="";
|
||||
String time="";
|
||||
if(attributes.getNamedItem("tag")!=null){
|
||||
tagsString=attributes.getNamedItem("tag").getNodeValue();
|
||||
}
|
||||
if(attributes.getNamedItem("description")!=null){
|
||||
title=attributes.getNamedItem("description").getNodeValue();
|
||||
}
|
||||
if(attributes.getNamedItem("extended")!=null){
|
||||
description=attributes.getNamedItem("extended").getNodeValue();
|
||||
}
|
||||
if(attributes.getNamedItem("time")!=null){
|
||||
time=attributes.getNamedItem("time").getNodeValue();
|
||||
}
|
||||
Set<String> tags=new HashSet<String>();
|
||||
|
||||
if(title != null){
|
||||
bm.setProperty(Bookmark.BOOKMARK_TITLE, title);
|
||||
}
|
||||
if(tagsString!=null){
|
||||
tags = listManager.string2set(tagsString.replace(' ', ','));
|
||||
}
|
||||
bm.setTags(tags, true);
|
||||
if(time != null){
|
||||
|
||||
Date parsedDate = null;
|
||||
try {
|
||||
parsedDate = DateFormatter.parseISO8601(time);
|
||||
} catch (final ParseException e) {
|
||||
parsedDate = new Date();
|
||||
}
|
||||
bm.setTimeStamp(parsedDate.getTime());
|
||||
}
|
||||
if(description!=null){
|
||||
bm.setProperty(Bookmark.BOOKMARK_DESCRIPTION, description);
|
||||
}
|
||||
bm.setPublic(importPublic);
|
||||
db.saveBookmark(bm);
|
||||
|
||||
importCount++;
|
||||
}
|
||||
final NodeList children=doc.getChildNodes();
|
||||
if(children != null){
|
||||
for (int i=0; i<children.getLength(); i++) {
|
||||
importCount += parseXMLimport(db, children.item(i), importPublic);
|
||||
}
|
||||
}
|
||||
db.flushTagCache();
|
||||
|
||||
return importCount;
|
||||
}
|
||||
}
|
Loading…
Reference in new issue