added stub classes for

- a new database for url reference data ('seen links')
- a new database extending the references to the full url metadata
attribute set, which shall replace the old metadata database once it is
finished
- migration helper class stubs to use the old and new metadata databases
simultaneously
pull/1/head
Michael Christen 13 years ago
parent ac5d124ee0
commit e32055aa15

@ -0,0 +1,62 @@
/**
* MetadataVocabulary
* Copyright 2012 by Michael Peter Christen
* First released 12.4.2012 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.data.meta;
import java.util.Set;
import net.yacy.cora.lod.Literal;
import net.yacy.cora.lod.Vocabulary;
/**
 * Vocabulary for the url metadata records.
 * Every constant owns a predicate string of the form
 * {@code PREFIX + ":" + <lower-cased constant name>}, e.g. {@code ym:url}.
 */
public enum MetadataVocabulary implements Vocabulary {

    moddate, url;

    /** identifier reported by {@link #getIdentifier()} */
    public static final String IDENTIFIER = "http://yacy.net/metadata";

    /** prefix reported by {@link #getPrefix()} and used to build the predicate */
    public static final String PREFIX = "ym";

    /** predicate of this constant, computed once in the constructor */
    private final String predicate;

    private MetadataVocabulary() {
        // lower-case with a fixed locale: the default-locale variant of
        // toLowerCase() can produce different characters (e.g. for 'I' in a
        // Turkish locale), which would make the predicate depend on the host
        this.predicate = PREFIX + ":" + this.name().toLowerCase(java.util.Locale.ROOT);
    }

    /**
     * @return the constant {@link #IDENTIFIER}
     */
    @Override
    public String getIdentifier() {
        return IDENTIFIER;
    }

    /**
     * @return the constant {@link #PREFIX}
     */
    @Override
    public String getPrefix() {
        return PREFIX;
    }

    /**
     * @return the predicate of this constant, i.e. prefix, colon and lower-cased name
     */
    @Override
    public String getPredicate() {
        return this.predicate;
    }

    /**
     * No literals are defined for this vocabulary.
     * @return always null
     */
    @Override
    public Set<Literal> getLiterals() {
        return null;
    }
}

@ -1,55 +1,53 @@
// URIMetadata.java
// (C) 2009 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
// first published 03.04.2009 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
/**
* URIMetadata
* Copyright 2012 by Michael Peter Christen
* First released 3.4.2012 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.data.meta;
import java.util.Date;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.order.Bitfield;
import net.yacy.kelondro.rwi.Reference;
public interface URIMetadata {
public interface URIMetadata extends URIReference {
public String dc_title();
public Row.Entry toRowEntry();
public byte[] hash();
public long ranking();
public String dc_creator();
public String dc_publisher();
public String dc_subject();
public float lat();
public Date moddate();
public float lon();
public long ranking();
public Date loaddate();
public Date freshdate();
public byte[] referrerHash();
public String md5();
public char doctype();
@ -79,10 +77,7 @@ public interface URIMetadata {
public Reference word();
public boolean isOlder(final URIMetadata other);
public String toString(final String snippet);
@Override
public String toString();
public String toString(final String snippet);
}

@ -1,3 +1,25 @@
/**
* URIMetadataNode
* Copyright 2012 by Michael Peter Christen
* First released 3.4.2012 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.data.meta;
import net.yacy.cora.lod.Node;
@ -5,7 +27,7 @@ import net.yacy.cora.lod.vocabulary.Rdf;
import net.yacy.kelondro.data.word.WordReferenceVars;
public class URIMetadataNode /*implements URIMetadata*/ {
public class URIMetadataNode /*extends URIReferenceNode implements URIMetadata*/ {
private final Node entry;
private final String snippet;

@ -30,6 +30,7 @@ import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.regex.Pattern;
@ -166,6 +167,12 @@ public class URIMetadataRow implements URIMetadata {
this.comp = null;
}
/**
 * Map representation of this row.
 * The conversion has not been written yet, so no map is produced.
 * @return always null for now
 */
@Override
public Map<String, byte[]> toMap() {
return null; // placeholder: conversion of the row into a map is still to be implemented
}
private void encodeDate(final int col, final Date d) {
// calculates the number of days since 1.1.1970 and returns this as 4-byte array
// 86400000 is the number of milliseconds in one day
@ -642,4 +649,5 @@ public class URIMetadataRow implements URIMetadata {
return p < 0 ? 0.0f : Float.parseFloat(this.latlon.substring(p + 1));
}
}
}

@ -0,0 +1,71 @@
/**
* URIReference
* Copyright 2012 by Michael Peter Christen
* First released 3.4.2012 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.data.meta;
import java.util.Date;
import java.util.Map;
import java.util.regex.Pattern;
/**
 * A record that references a stored url: it carries the url itself,
 * the hash of that url and a modification date.
 */
public interface URIReference {

    /**
     * Unique key for the stored url; it is equal to url().hash().
     * @return the hash of the stored url
     */
    byte[] hash();

    /**
     * Modification date of the record. When the record is created this is
     * the creation date; if the record is modified later, the date shall change.
     * @return the modification date of this record
     */
    Date moddate();

    /**
     * The DigestURI is the payload of the URIReference.
     * @return the url as DigestURI with assigned URL hash according to the record hash
     */
    DigestURI url();

    /**
     * Check if the url matches against a given matcher.
     * @param matcher the pattern to test the url with
     * @return true if the url() matches
     */
    boolean matches(Pattern matcher);

    /**
     * Transform the record into a map which can be stored.
     * @return the record as a map from attribute name to attribute value bytes
     */
    Map<String, byte[]> toMap();

    /**
     * Produce a visible representation of the record.
     * @return a string for the url()
     */
    @Override
    String toString();
}

@ -0,0 +1,84 @@
/**
* URIReferenceNode
* Copyright 2012 by Michael Peter Christen
* First released 5.4.2012 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.data.meta;
import java.net.MalformedURLException;
import java.text.ParseException;
import java.util.Date;
import java.util.HashMap;
import java.util.Map;
import java.util.regex.Pattern;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.ASCII;
/**
 * Map-backed implementation of {@link URIReference}.
 * The url and the modification date are stored as ASCII-encoded byte arrays
 * under the keys {@code MetadataVocabulary.url.name()} and
 * {@code MetadataVocabulary.moddate.name()}; the url hash is kept in a field.
 */
public class URIReferenceNode extends HashMap<String, byte[]> implements URIReference {

    private static final long serialVersionUID = -1580155759116466570L;

    /** hash of the url, taken from the DigestURI passed to the constructor */
    private final byte[] hash;

    /**
     * Create a reference record for a url.
     * @param uri the url; its hash and its normal form are stored
     * @param date the modification date, stored as formatted by ISO8601Formatter.FORMATTER
     */
    public URIReferenceNode(DigestURI uri, Date date) {
        this.hash = uri.hash();
        this.put(MetadataVocabulary.url.name(), ASCII.getBytes(uri.toNormalform(true, false)));
        this.put(MetadataVocabulary.moddate.name(), ASCII.getBytes(ISO8601Formatter.FORMATTER.format(date)));
    }

    /**
     * @return the url hash given at construction time
     */
    @Override
    public byte[] hash() {
        return this.hash;
    }

    /**
     * @return the stored modification date, or null if no date is stored
     *         or the stored value cannot be parsed
     */
    @Override
    public Date moddate() {
        byte[] x = this.get(MetadataVocabulary.moddate.name());
        try {
            return x == null ? null : ISO8601Formatter.FORMATTER.parse(ASCII.String(x));
        } catch (ParseException e) {
            // an unparseable date is treated like a missing date
            return null;
        }
    }

    /**
     * @return the stored url combined with the record hash, or null if no url
     *         is stored or the stored value is not a valid url
     */
    @Override
    public DigestURI url() {
        // read the url attribute; reading the moddate attribute here would
        // hand a date string to the DigestURI constructor
        byte[] x = this.get(MetadataVocabulary.url.name());
        try {
            return x == null ? null : new DigestURI(ASCII.String(x), this.hash);
        } catch (MalformedURLException e) {
            // a malformed url is treated like a missing url
            return null;
        }
    }

    /**
     * Test the stored url string against a pattern.
     * @param matcher the pattern which must match the complete url string
     * @return true if a url is stored and it matches; false otherwise
     */
    @Override
    public boolean matches(Pattern matcher) {
        // the pattern must be applied to the url attribute, not to the date
        byte[] x = this.get(MetadataVocabulary.url.name());
        if (x == null) return false;
        return matcher.matcher(ASCII.String(x)).matches();
    }

    /**
     * @return this instance itself (not a copy); changes to the returned map
     *         change this record
     */
    @Override
    public Map<String, byte[]> toMap() {
        return this;
    }
}

@ -0,0 +1,119 @@
/**
* DocumentMetadata
* Copyright 2012 by Michael Peter Christen
* First released 3.4.2012 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.index;
import java.io.IOException;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.WordReference;
/**
 * Stub implementation of {@link Metadata}.
 * None of the methods does any work yet: void methods are empty and all
 * other methods return a fixed default (0, false or null).
 */
public class DocumentMetadata implements Metadata {

    /** Not implemented yet; does nothing. */
    @Override
    public void clearCache() {
        // placeholder
    }

    /** Not implemented yet; does nothing. */
    @Override
    public void clear() throws IOException {
        // placeholder
    }

    /** Not implemented yet. @return always 0 */
    @Override
    public int size() {
        return 0; // placeholder
    }

    /** Not implemented yet; does nothing. */
    @Override
    public void close() {
        // placeholder
    }

    /** Not implemented yet. @return always 0 */
    @Override
    public int writeCacheSize() {
        return 0; // placeholder
    }

    /** Not implemented yet. @return always null */
    @Override
    public URIMetadata load(final Element<WordReference> obrwi) {
        return null; // placeholder
    }

    /** Not implemented yet. @return always null */
    @Override
    public URIMetadata load(final byte[] urlHash) {
        return null; // placeholder
    }

    /** Not implemented yet; the entry is not stored anywhere. */
    @Override
    public void store(final URIMetadata entry) throws IOException {
        // placeholder
    }

    /** Not implemented yet. @return always false */
    @Override
    public boolean remove(final byte[] urlHashBytes) {
        return false; // placeholder
    }

    /** Not implemented yet. @return always false */
    @Override
    public boolean exists(final byte[] urlHash) {
        return false; // placeholder
    }

    /** Not implemented yet. @return always null */
    @Override
    public CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
        return null; // placeholder
    }

    /** Not implemented yet. @return always null */
    @Override
    public CloneableIterator<URIMetadata> entries() throws IOException {
        return null; // placeholder
    }

    /** Not implemented yet. @return always null */
    @Override
    public CloneableIterator<URIMetadata> entries(final boolean up, final String firstHash) throws IOException {
        return null; // placeholder
    }

    /** Not implemented yet; nothing is deleted. @return always 0 */
    @Override
    public int deleteDomain(final String hosthash) throws IOException {
        return 0; // placeholder
    }
}

@ -0,0 +1,94 @@
/**
* DocumentReference
* Copyright 2012 by Michael Peter Christen
* First released 3.4.2012 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.index;
import java.io.IOException;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.cora.storage.MapStore;
import net.yacy.kelondro.data.meta.URIReference;
import net.yacy.kelondro.data.word.WordReference;
/**
 * Store for {@link URIReference} records on top of a {@link MapStore}.
 * Only clear, size, close and store use the map store so far; all other
 * methods are placeholders returning fixed values.
 * The data field is not assigned anywhere inside this class, so it has to
 * be set from outside before clear, size or store are called.
 */
public class DocumentReference {

    /** backing store; null until assigned from outside and again after close() */
    public MapStore data;

    /**
     * Remove all content from the backing store.
     * @throws IOException
     */
    public void clear() throws IOException {
        this.data.clear();
    }

    /**
     * @return the size reported by the backing store
     */
    public int size() {
        return this.data.size();
    }

    /**
     * Close the backing store, if there is one, and drop the reference to it.
     */
    public void close() {
        if (this.data == null) {
            return;
        }
        this.data.close();
        this.data = null;
    }

    /**
     * Write a record to the backing store, keyed by the record hash.
     * @param entry the record to be stored
     * @throws IOException
     */
    public void store(final URIReference entry) throws IOException {
        final byte[] key = entry.hash();
        this.data.put(key, entry.toMap());
    }

    /** Placeholder. @return always null */
    public URIReference load(final WeakPriorityBlockingQueue.Element<WordReference> obrwi) {
        return null;
    }

    /** Placeholder. @return always null */
    public URIReference load(final byte[] urlHash) {
        return null;
    }

    /** Placeholder. @return always false */
    public boolean remove(final byte[] urlHashBytes) {
        return false;
    }

    /** Placeholder. @return always false */
    public boolean exists(final byte[] urlHash) {
        return false;
    }

    /** Placeholder. @return always null */
    public CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
        return null;
    }

    /** Placeholder. @return always null */
    public CloneableIterator<URIReference> entries() throws IOException {
        return null;
    }

    /** Placeholder. @return always null */
    public CloneableIterator<URIReference> entries(final boolean up, final String firstHash) throws IOException {
        return null;
    }

    /**
     * Using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible
     * to address all urls from a specific domain; such a fragment can be used
     * here to delete all these urls at once.
     * This is a placeholder at the moment: nothing is deleted.
     * @param hosthash
     * @return always -1 for now
     * @throws IOException
     */
    public int deleteDomain(final String hosthash) throws IOException {
        return -1;
    }
}

@ -0,0 +1,68 @@
/**
* Metadata
* Copyright 2012 by Michael Peter Christen
* First released 3.4.2012 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.index;
import java.io.IOException;
import net.yacy.cora.order.CloneableIterator;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.kelondro.data.meta.URIMetadata;
import net.yacy.kelondro.data.word.WordReference;
/**
 * Common operations of a url metadata database holding {@link URIMetadata} entries.
 */
public interface Metadata {

    void clearCache();

    void clear() throws IOException;

    int size();

    void close();

    int writeCacheSize();

    /**
     * @param obrwi a queue element wrapping a word reference
     * @return a metadata entry for the given element
     */
    URIMetadata load(WeakPriorityBlockingQueue.Element<WordReference> obrwi);

    /**
     * @param urlHash the hash of a url
     * @return a metadata entry for the given url hash
     */
    URIMetadata load(byte[] urlHash);

    void store(URIMetadata entry) throws IOException;

    boolean remove(byte[] urlHashBytes);

    boolean exists(byte[] urlHash);

    CloneableIterator<byte[]> keys(boolean up, byte[] firstKey);

    CloneableIterator<URIMetadata> entries() throws IOException;

    CloneableIterator<URIMetadata> entries(boolean up, String firstHash) throws IOException;

    /**
     * Using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible
     * to address all urls from a specific domain. Here such a fragment can be
     * used to delete all these domains at once.
     * @param hosthash
     * @return number of deleted domains
     * @throws IOException
     */
    int deleteDomain(String hosthash) throws IOException;
}

@ -0,0 +1,34 @@
/**
* MetadataMigration
* Copyright 2012 by Michael Peter Christen
* First released 3.4.2012 at http://yacy.net
*
* This file is part of YaCy Content Integration
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.search.index;
/**
 * Migration class that combines the old MetadataRepository classes
 * with the new DocumentMetadata class.
 * So far it only declares one field per database; neither field is
 * assigned or used yet.
 */
public class MetadataMigration /*implements Metadata*/ {

    /** the new metadata database */
    private DocumentMetadata documentMetadata;

    /** the old metadata database */
    private MetadataRepository metadataRepository;
}

@ -62,7 +62,7 @@ import net.yacy.kelondro.util.MemoryControl;
import net.yacy.repository.Blacklist;
import de.anomic.crawler.CrawlStacker;
public final class MetadataRepository implements Iterable<byte[]> {
public final class MetadataRepository implements /*Metadata,*/ Iterable<byte[]> {
// class objects
protected Index urlIndexFile;

Loading…
Cancel
Save