From e32055aa15822a04bec61072c4e525af9021f183 Mon Sep 17 00:00:00 2001 From: Michael Christen Date: Fri, 13 Apr 2012 07:09:15 +0200 Subject: [PATCH] added stub classes for - a new database for url reference data ('seen links') - a new database extending the references to the full url metadata attributes set which shall replace the old metadata database once it is finished - migration helper class stubs to use old and new metadata databases simultaneously --- .../data/meta/MetadataVocabulary.java | 62 +++++++++ .../yacy/kelondro/data/meta/URIMetadata.java | 73 +++++------ .../kelondro/data/meta/URIMetadataNode.java | 24 +++- .../kelondro/data/meta/URIMetadataRow.java | 8 ++ .../yacy/kelondro/data/meta/URIReference.java | 71 +++++++++++ .../kelondro/data/meta/URIReferenceNode.java | 84 +++++++++++++ .../yacy/search/index/DocumentMetadata.java | 119 ++++++++++++++++++ .../yacy/search/index/DocumentReference.java | 94 ++++++++++++++ source/net/yacy/search/index/Metadata.java | 68 ++++++++++ .../yacy/search/index/MetadataMigration.java | 34 +++++ .../yacy/search/index/MetadataRepository.java | 2 +- 11 files changed, 598 insertions(+), 41 deletions(-) create mode 100644 source/net/yacy/kelondro/data/meta/MetadataVocabulary.java create mode 100644 source/net/yacy/kelondro/data/meta/URIReference.java create mode 100644 source/net/yacy/kelondro/data/meta/URIReferenceNode.java create mode 100644 source/net/yacy/search/index/DocumentMetadata.java create mode 100644 source/net/yacy/search/index/DocumentReference.java create mode 100644 source/net/yacy/search/index/Metadata.java create mode 100644 source/net/yacy/search/index/MetadataMigration.java diff --git a/source/net/yacy/kelondro/data/meta/MetadataVocabulary.java b/source/net/yacy/kelondro/data/meta/MetadataVocabulary.java new file mode 100644 index 000000000..bfdc8bac4 --- /dev/null +++ b/source/net/yacy/kelondro/data/meta/MetadataVocabulary.java @@ -0,0 +1,62 @@ +/** + * MetadataVocabulary + * Copyright 2012 by Michael 
Peter Christen + * First released 12.4.2012 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.kelondro.data.meta; + +import java.util.Set; + +import net.yacy.cora.lod.Literal; +import net.yacy.cora.lod.Vocabulary; + +public enum MetadataVocabulary implements Vocabulary { + + moddate, url; + + public final static String IDENTIFIER = "http://yacy.net/metadata"; + public final static String PREFIX = "ym"; + + private final String predicate; + + private MetadataVocabulary() { + this.predicate = PREFIX + ":" + this.name().toLowerCase(); + } + + @Override + public String getIdentifier() { + return IDENTIFIER; + } + + @Override + public String getPrefix() { + return PREFIX; + } + + @Override + public String getPredicate() { + return this.predicate; + } + + @Override + public Set getLiterals() { + return null; + } +} diff --git a/source/net/yacy/kelondro/data/meta/URIMetadata.java b/source/net/yacy/kelondro/data/meta/URIMetadata.java index fe1b41cfa..27a0a446c 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadata.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadata.java @@ -1,55 +1,53 @@ -// URIMetadata.java -// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. 
M., Germany -// first published 03.04.2009 on http://yacy.net -// -// This is a part of YaCy, a peer-to-peer based web search engine -// -// $LastChangedDate$ -// $LastChangedRevision$ -// $LastChangedBy$ -// -// LICENSE -// -// This program is free software; you can redistribute it and/or modify -// it under the terms of the GNU General Public License as published by -// the Free Software Foundation; either version 2 of the License, or -// (at your option) any later version. -// -// This program is distributed in the hope that it will be useful, -// but WITHOUT ANY WARRANTY; without even the implied warranty of -// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -// GNU General Public License for more details. -// -// You should have received a copy of the GNU General Public License -// along with this program; if not, write to the Free Software -// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA +/** + * URIMetadata + * Copyright 2012 by Michael Peter Christen + * First released 3.4.2012 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ package net.yacy.kelondro.data.meta; import java.util.Date; -import net.yacy.kelondro.index.Row; import net.yacy.kelondro.order.Bitfield; import net.yacy.kelondro.rwi.Reference; -public interface URIMetadata { +public interface URIMetadata extends URIReference { + public String dc_title(); - public Row.Entry toRowEntry(); - - public byte[] hash(); - - public long ranking(); + public String dc_creator(); + + public String dc_publisher(); + + public String dc_subject(); + + public float lat(); - public Date moddate(); + public float lon(); + + public long ranking(); public Date loaddate(); public Date freshdate(); - public byte[] referrerHash(); - public String md5(); public char doctype(); @@ -79,10 +77,7 @@ public interface URIMetadata { public Reference word(); public boolean isOlder(final URIMetadata other); - - public String toString(final String snippet); - @Override - public String toString(); + public String toString(final String snippet); } diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java index dfd2d66f1..e110337cb 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataNode.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataNode.java @@ -1,3 +1,25 @@ +/** + * URIMetadataNode + * Copyright 2012 by Michael Peter Christen + * First released 3.4.2012 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. 
+ * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + package net.yacy.kelondro.data.meta; import net.yacy.cora.lod.Node; @@ -5,7 +27,7 @@ import net.yacy.cora.lod.vocabulary.Rdf; import net.yacy.kelondro.data.word.WordReferenceVars; -public class URIMetadataNode /*implements URIMetadata*/ { +public class URIMetadataNode /*extends URIReferenceNode implements URIMetadata*/ { private final Node entry; private final String snippet; diff --git a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java index dbef4fe98..3ab29c0d3 100644 --- a/source/net/yacy/kelondro/data/meta/URIMetadataRow.java +++ b/source/net/yacy/kelondro/data/meta/URIMetadataRow.java @@ -30,6 +30,7 @@ import java.net.MalformedURLException; import java.text.ParseException; import java.util.Date; import java.util.List; +import java.util.Map; import java.util.Properties; import java.util.regex.Pattern; @@ -166,6 +167,12 @@ public class URIMetadataRow implements URIMetadata { this.comp = null; } + @Override + public Map toMap() { + // TODO to be implemented + return null; + } + private void encodeDate(final int col, final Date d) { // calculates the number of days since 1.1.1970 and returns this as 4-byte array // 86400000 is the number of milliseconds in one day @@ -642,4 +649,5 @@ public class URIMetadataRow implements URIMetadata { return p < 0 ? 
0.0f : Float.parseFloat(this.latlon.substring(p + 1)); } } + } diff --git a/source/net/yacy/kelondro/data/meta/URIReference.java b/source/net/yacy/kelondro/data/meta/URIReference.java new file mode 100644 index 000000000..d2ba476a3 --- /dev/null +++ b/source/net/yacy/kelondro/data/meta/URIReference.java @@ -0,0 +1,71 @@ +/** + * URIReference + * Copyright 2012 by Michael Peter Christen + * First released 3.4.2012 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.kelondro.data.meta; + +import java.util.Date; +import java.util.Map; +import java.util.regex.Pattern; + +public interface URIReference { + + /** + * The hash of a URIReference is a unique key for the stored URL. + * It is in fact equal to url().hash() + * @return the hash of the stored url + */ + public byte[] hash(); + + /** + * The modification date of the URIReference is given if + * the record was created first and is defined with the + * creation date. If the record is modified later, the date shall change. 
+ * @return the modification date of this record + */ + public Date moddate(); + + /** + * The DigestURI is the payload of the URIReference + * @return the url as DigestURI with assigned URL hash according to the record hash + */ + public DigestURI url(); + + /** + * check if the url matches against a given matcher + * @param matcher the pattern that the url is tested against + * @return true if the url() matches + */ + public boolean matches(final Pattern matcher); + + /** + * transform the record into a map which can be stored + * @return the record as a map + */ + public Map toMap(); + + /** + * produce a visible representation of the record + * @return a string for the url() + */ + @Override + public String toString(); +} diff --git a/source/net/yacy/kelondro/data/meta/URIReferenceNode.java b/source/net/yacy/kelondro/data/meta/URIReferenceNode.java new file mode 100644 index 000000000..bacdfe998 --- /dev/null +++ b/source/net/yacy/kelondro/data/meta/URIReferenceNode.java @@ -0,0 +1,84 @@ +/** + * URIReferenceNode + * Copyright 2012 by Michael Peter Christen + * First released 5.4.2012 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.kelondro.data.meta; + +import java.net.MalformedURLException; +import java.text.ParseException; +import java.util.Date; +import java.util.HashMap; +import java.util.Map; +import java.util.regex.Pattern; + +import net.yacy.cora.date.ISO8601Formatter; +import net.yacy.cora.document.ASCII; + +public class URIReferenceNode extends HashMap<String, byte[]> implements URIReference { /* map-backed URIReference: the url and the moddate are kept as ASCII bytes under their MetadataVocabulary names */ + + private static final long serialVersionUID = -1580155759116466570L; + + private byte[] hash; /* url hash, taken from the DigestURI given to the constructor */ + + public URIReferenceNode(DigestURI uri, Date date) { + this.hash = uri.hash(); + this.put(MetadataVocabulary.url.name(), ASCII.getBytes(uri.toNormalform(true, false))); + this.put(MetadataVocabulary.moddate.name(), ASCII.getBytes(ISO8601Formatter.FORMATTER.format(date))); + } + + @Override + public byte[] hash() { + return this.hash; + } + + @Override + public Date moddate() { + byte[] x = this.get(MetadataVocabulary.moddate.name()); + try { + return x == null ? null : ISO8601Formatter.FORMATTER.parse(ASCII.String(x)); + } catch (ParseException e) { /* a stored date that cannot be parsed is reported as null */ + return null; + } + } + + @Override + public DigestURI url() { + byte[] x = this.get(MetadataVocabulary.url.name()); /* read the url entry, not the moddate entry */ + try { + return x == null ? 
null : new DigestURI(ASCII.String(x), this.hash); + } catch (MalformedURLException e) { /* a stored url that cannot be parsed is reported as null */ + return null; + } + } + + @Override + public boolean matches(Pattern matcher) { + byte[] x = this.get(MetadataVocabulary.url.name()); /* the pattern is applied to the stored url string */ + if (x == null) return false; + return matcher.matcher(ASCII.String(x)).matches(); + } + + @Override + public Map<String, byte[]> toMap() { + return this; /* the node itself is the map; no copy is made */ + } + +} diff --git a/source/net/yacy/search/index/DocumentMetadata.java b/source/net/yacy/search/index/DocumentMetadata.java new file mode 100644 index 000000000..b3cce470b --- /dev/null +++ b/source/net/yacy/search/index/DocumentMetadata.java @@ -0,0 +1,119 @@ +/** + * DocumentMetadata + * Copyright 2012 by Michael Peter Christen + * First released 3.4.2012 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.search.index; + +import java.io.IOException; + +import net.yacy.cora.order.CloneableIterator; +import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element; +import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.word.WordReference; + +public class DocumentMetadata implements Metadata { + + @Override + public void clearCache() { + // TODO Auto-generated method stub + + } + + @Override + public void clear() throws IOException { + // TODO Auto-generated method stub + + } + + @Override + public int size() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public void close() { + // TODO Auto-generated method stub + + } + + @Override + public int writeCacheSize() { + // TODO Auto-generated method stub + return 0; + } + + @Override + public URIMetadata load(Element obrwi) { + // TODO Auto-generated method stub + return null; + } + + @Override + public URIMetadata load(byte[] urlHash) { + // TODO Auto-generated method stub + return null; + } + + @Override + public void store(URIMetadata entry) throws IOException { + // TODO Auto-generated method stub + + } + + @Override + public boolean remove(byte[] urlHashBytes) { + // TODO Auto-generated method stub + return false; + } + + @Override + public boolean exists(byte[] urlHash) { + // TODO Auto-generated method stub + return false; + } + + @Override + public CloneableIterator keys(boolean up, byte[] firstKey) { + // TODO Auto-generated method stub + return null; + } + + @Override + public CloneableIterator entries() throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public CloneableIterator entries(boolean up, String firstHash) + throws IOException { + // TODO Auto-generated method stub + return null; + } + + @Override + public int deleteDomain(String hosthash) throws IOException { + // TODO Auto-generated method stub + return 0; + } + +} diff --git a/source/net/yacy/search/index/DocumentReference.java 
b/source/net/yacy/search/index/DocumentReference.java new file mode 100644 index 000000000..51906c65e --- /dev/null +++ b/source/net/yacy/search/index/DocumentReference.java @@ -0,0 +1,94 @@ +/** + * DocumentReference + * Copyright 2012 by Michael Peter Christen + * First released 3.4.2012 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.search.index; + +import java.io.IOException; + +import net.yacy.cora.order.CloneableIterator; +import net.yacy.cora.sorting.WeakPriorityBlockingQueue; +import net.yacy.cora.storage.MapStore; +import net.yacy.kelondro.data.meta.URIReference; +import net.yacy.kelondro.data.word.WordReference; + +public class DocumentReference { + + public MapStore data; + + public void clear() throws IOException { + data.clear(); + } + + public int size() { + return data.size(); + } + + public void close() { + if (data != null) { + data.close(); + } + data = null; + } + + public void store(final URIReference entry) throws IOException { + data.put(entry.hash(), entry.toMap()); + } + + public URIReference load(final WeakPriorityBlockingQueue.Element obrwi) { + return null; + } + + public URIReference load(final byte[] urlHash){ + return null; + } + + public boolean remove(final byte[] urlHashBytes) { + return false; + } + + public boolean exists(final byte[] urlHash) { + return false; + } + + public CloneableIterator keys(final boolean up, final byte[] firstKey) { + return null; + } + + public CloneableIterator entries() throws IOException { + return null; + } + + public CloneableIterator entries(final boolean up, final String firstHash) throws IOException { + return null; + } + + /** + * using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible to address all urls from a specific domain + * here such a fragment can be used to delete all these domains at once + * @param hosthash + * @return number of deleted domains + * @throws IOException + */ + public int deleteDomain(final String hosthash) throws IOException { + return -1; + } +} diff --git a/source/net/yacy/search/index/Metadata.java b/source/net/yacy/search/index/Metadata.java new file mode 100644 index 000000000..96263296f --- /dev/null +++ b/source/net/yacy/search/index/Metadata.java @@ -0,0 +1,68 @@ +/** + * Metadata + * Copyright 2012 by Michael Peter Christen + * First released 3.4.2012 
at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . + */ + +package net.yacy.search.index; + +import java.io.IOException; + +import net.yacy.cora.order.CloneableIterator; +import net.yacy.cora.sorting.WeakPriorityBlockingQueue; +import net.yacy.kelondro.data.meta.URIMetadata; +import net.yacy.kelondro.data.word.WordReference; + +public interface Metadata { + + public void clearCache(); + + public void clear() throws IOException; + + public int size(); + + public void close(); + + public int writeCacheSize(); + + public URIMetadata load(final WeakPriorityBlockingQueue.Element obrwi); + + public URIMetadata load(final byte[] urlHash); + + public void store(final URIMetadata entry) throws IOException; + + public boolean remove(final byte[] urlHashBytes); + + public boolean exists(final byte[] urlHash); + + public CloneableIterator keys(final boolean up, final byte[] firstKey); + + public CloneableIterator entries() throws IOException; + + public CloneableIterator entries(final boolean up, final String firstHash) throws IOException; + + /** + * using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible to address all urls from a specific domain + * here such a fragment can be used to delete all these domains at once + * @param hosthash + * @return number of deleted 
domains + * @throws IOException + */ + public int deleteDomain(final String hosthash) throws IOException; +} diff --git a/source/net/yacy/search/index/MetadataMigration.java b/source/net/yacy/search/index/MetadataMigration.java new file mode 100644 index 000000000..21911762f --- /dev/null +++ b/source/net/yacy/search/index/MetadataMigration.java @@ -0,0 +1,34 @@ +/** + * MetadataMigration + * Copyright 2012 by Michael Peter Christen + * First released 3.4.2012 at http://yacy.net + * + * This file is part of YaCy Content Integration + * + * This library is free software; you can redistribute it and/or + * modify it under the terms of the GNU Lesser General Public + * License as published by the Free Software Foundation; either + * version 2.1 of the License, or (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this program in the file lgpl21.txt + * If not, see . 
+ */ + +package net.yacy.search.index; + +/* + * migration class that combines old MetadataRepository classes + * with the new DocumentMetadata class + */ +public class MetadataMigration /*implements Metadata*/ { + + private MetadataRepository metadataRepository; + private DocumentMetadata documentMetadata; + +} diff --git a/source/net/yacy/search/index/MetadataRepository.java b/source/net/yacy/search/index/MetadataRepository.java index 5f953dde4..6bf2f8225 100644 --- a/source/net/yacy/search/index/MetadataRepository.java +++ b/source/net/yacy/search/index/MetadataRepository.java @@ -62,7 +62,7 @@ import net.yacy.kelondro.util.MemoryControl; import net.yacy.repository.Blacklist; import de.anomic.crawler.CrawlStacker; -public final class MetadataRepository implements Iterable { +public final class MetadataRepository implements /*Metadata,*/ Iterable { // class objects protected Index urlIndexFile;