- a new database for url reference data ('seen links')
- a new database extending the references to the full url metadata attribute set, which shall replace the old metadata database once it is finished
- migration helper class stubs to use the old and new metadata databases simultaneously
pull/1/head
parent
ac5d124ee0
commit
e32055aa15
@ -0,0 +1,62 @@
|
||||
/**
|
||||
* MetadataVocabulary
|
||||
* Copyright 2012 by Michael Peter Christen
|
||||
* First released 12.4.2012 at http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.kelondro.data.meta;
|
||||
|
||||
import java.util.Set;
|
||||
|
||||
import net.yacy.cora.lod.Literal;
|
||||
import net.yacy.cora.lod.Vocabulary;
|
||||
|
||||
public enum MetadataVocabulary implements Vocabulary {
|
||||
|
||||
moddate, url;
|
||||
|
||||
public final static String IDENTIFIER = "http://yacy.net/metadata";
|
||||
public final static String PREFIX = "ym";
|
||||
|
||||
private final String predicate;
|
||||
|
||||
private MetadataVocabulary() {
|
||||
this.predicate = PREFIX + ":" + this.name().toLowerCase();
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getIdentifier() {
|
||||
return IDENTIFIER;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPrefix() {
|
||||
return PREFIX;
|
||||
}
|
||||
|
||||
@Override
|
||||
public String getPredicate() {
|
||||
return this.predicate;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Set<Literal> getLiterals() {
|
||||
return null;
|
||||
}
|
||||
}
|
@ -0,0 +1,71 @@
|
||||
/**
|
||||
* URIReference
|
||||
* Copyright 2012 by Michael Peter Christen
|
||||
* First released 3.4.2012 at http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.kelondro.data.meta;
|
||||
|
||||
import java.util.Date;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
public interface URIReference {
|
||||
|
||||
/**
|
||||
* The hash of a URIReference is a unique key for the stored URL.
|
||||
* It is in fact equal to url().hash()
|
||||
* @return the hash of the stored url
|
||||
*/
|
||||
public byte[] hash();
|
||||
|
||||
/**
|
||||
* The modification date of the URIReference is given if
|
||||
* the record was created first and is defined with the
|
||||
* creation date. If the record is modified later, the date shall change.
|
||||
* @return the modification date of this record
|
||||
*/
|
||||
public Date moddate();
|
||||
|
||||
/**
|
||||
* The DigestURI is the payload of the URIReference
|
||||
* @return the url as DigestURI with assigned URL hash according to the record hash
|
||||
*/
|
||||
public DigestURI url();
|
||||
|
||||
/**
|
||||
* check if the url matches agains a given matcher
|
||||
* @param matcher
|
||||
* @return true if the url() matches
|
||||
*/
|
||||
public boolean matches(final Pattern matcher);
|
||||
|
||||
/**
|
||||
* transform the record into a map which can be stored
|
||||
* @return
|
||||
*/
|
||||
public Map<String, byte[]> toMap();
|
||||
|
||||
/**
|
||||
* produce a visible representation of the record
|
||||
* @return a string for the url()
|
||||
*/
|
||||
@Override
|
||||
public String toString();
|
||||
}
|
@ -0,0 +1,84 @@
|
||||
/**
|
||||
* URIReferenceNode
|
||||
* Copyright 2012 by Michael Peter Christen
|
||||
* First released 5.4.2012 at http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.kelondro.data.meta;
|
||||
|
||||
import java.net.MalformedURLException;
|
||||
import java.text.ParseException;
|
||||
import java.util.Date;
|
||||
import java.util.HashMap;
|
||||
import java.util.Map;
|
||||
import java.util.regex.Pattern;
|
||||
|
||||
import net.yacy.cora.date.ISO8601Formatter;
|
||||
import net.yacy.cora.document.ASCII;
|
||||
|
||||
public class URIReferenceNode extends HashMap<String, byte[]> implements URIReference {
|
||||
|
||||
private static final long serialVersionUID = -1580155759116466570L;
|
||||
|
||||
private byte[] hash;
|
||||
|
||||
public URIReferenceNode(DigestURI uri, Date date) {
|
||||
this.hash = uri.hash();
|
||||
this.put(MetadataVocabulary.url.name(), ASCII.getBytes(uri.toNormalform(true, false)));
|
||||
this.put(MetadataVocabulary.moddate.name(), ASCII.getBytes(ISO8601Formatter.FORMATTER.format(date)));
|
||||
}
|
||||
|
||||
@Override
|
||||
public byte[] hash() {
|
||||
return this.hash;
|
||||
}
|
||||
|
||||
@Override
|
||||
public Date moddate() {
|
||||
byte[] x = this.get(MetadataVocabulary.moddate.name());
|
||||
try {
|
||||
return x == null ? null : ISO8601Formatter.FORMATTER.parse(ASCII.String(x));
|
||||
} catch (ParseException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public DigestURI url() {
|
||||
byte[] x = this.get(MetadataVocabulary.moddate.name());
|
||||
try {
|
||||
return x == null ? null : new DigestURI(ASCII.String(x), this.hash);
|
||||
} catch (MalformedURLException e) {
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean matches(Pattern matcher) {
|
||||
byte[] x = this.get(MetadataVocabulary.moddate.name());
|
||||
if (x == null) return false;
|
||||
return matcher.matcher(ASCII.String(x)).matches();
|
||||
}
|
||||
|
||||
@Override
|
||||
public Map<String, byte[]> toMap() {
|
||||
return this;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,119 @@
|
||||
/**
|
||||
* DocumentMetadata
|
||||
* Copyright 2012 by Michael Peter Christen
|
||||
* First released 3.4.2012 at http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.search.index;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import net.yacy.cora.order.CloneableIterator;
|
||||
import net.yacy.cora.sorting.WeakPriorityBlockingQueue.Element;
|
||||
import net.yacy.kelondro.data.meta.URIMetadata;
|
||||
import net.yacy.kelondro.data.word.WordReference;
|
||||
|
||||
public class DocumentMetadata implements Metadata {
|
||||
|
||||
@Override
|
||||
public void clearCache() {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public void clear() throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public int size() {
|
||||
// TODO Auto-generated method stub
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void close() {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public int writeCacheSize() {
|
||||
// TODO Auto-generated method stub
|
||||
return 0;
|
||||
}
|
||||
|
||||
@Override
|
||||
public URIMetadata load(Element<WordReference> obrwi) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public URIMetadata load(byte[] urlHash) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public void store(URIMetadata entry) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean remove(byte[] urlHashBytes) {
|
||||
// TODO Auto-generated method stub
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public boolean exists(byte[] urlHash) {
|
||||
// TODO Auto-generated method stub
|
||||
return false;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CloneableIterator<byte[]> keys(boolean up, byte[] firstKey) {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CloneableIterator<URIMetadata> entries() throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public CloneableIterator<URIMetadata> entries(boolean up, String firstHash)
|
||||
throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
return null;
|
||||
}
|
||||
|
||||
@Override
|
||||
public int deleteDomain(String hosthash) throws IOException {
|
||||
// TODO Auto-generated method stub
|
||||
return 0;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,94 @@
|
||||
/**
|
||||
* DocumentReference
|
||||
* Copyright 2012 by Michael Peter Christen
|
||||
* First released 3.4.2012 at http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.search.index;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import net.yacy.cora.order.CloneableIterator;
|
||||
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
|
||||
import net.yacy.cora.storage.MapStore;
|
||||
import net.yacy.kelondro.data.meta.URIReference;
|
||||
import net.yacy.kelondro.data.word.WordReference;
|
||||
|
||||
public class DocumentReference {
|
||||
|
||||
public MapStore data;
|
||||
|
||||
public void clear() throws IOException {
|
||||
data.clear();
|
||||
}
|
||||
|
||||
public int size() {
|
||||
return data.size();
|
||||
}
|
||||
|
||||
public void close() {
|
||||
if (data != null) {
|
||||
data.close();
|
||||
}
|
||||
data = null;
|
||||
}
|
||||
|
||||
public void store(final URIReference entry) throws IOException {
|
||||
data.put(entry.hash(), entry.toMap());
|
||||
}
|
||||
|
||||
public URIReference load(final WeakPriorityBlockingQueue.Element<WordReference> obrwi) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public URIReference load(final byte[] urlHash){
|
||||
return null;
|
||||
}
|
||||
|
||||
public boolean remove(final byte[] urlHashBytes) {
|
||||
return false;
|
||||
}
|
||||
|
||||
public boolean exists(final byte[] urlHash) {
|
||||
return false;
|
||||
}
|
||||
|
||||
public CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey) {
|
||||
return null;
|
||||
}
|
||||
|
||||
public CloneableIterator<URIReference> entries() throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
public CloneableIterator<URIReference> entries(final boolean up, final String firstHash) throws IOException {
|
||||
return null;
|
||||
}
|
||||
|
||||
/**
|
||||
* using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible to address all urls from a specific domain
|
||||
* here such a fragment can be used to delete all these domains at once
|
||||
* @param hosthash
|
||||
* @return number of deleted domains
|
||||
* @throws IOException
|
||||
*/
|
||||
public int deleteDomain(final String hosthash) throws IOException {
|
||||
return -1;
|
||||
}
|
||||
}
|
@ -0,0 +1,68 @@
|
||||
/**
|
||||
* Metadata
|
||||
* Copyright 2012 by Michael Peter Christen
|
||||
* First released 3.4.2012 at http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.search.index;
|
||||
|
||||
import java.io.IOException;
|
||||
|
||||
import net.yacy.cora.order.CloneableIterator;
|
||||
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
|
||||
import net.yacy.kelondro.data.meta.URIMetadata;
|
||||
import net.yacy.kelondro.data.word.WordReference;
|
||||
|
||||
public interface Metadata {
|
||||
|
||||
public void clearCache();
|
||||
|
||||
public void clear() throws IOException;
|
||||
|
||||
public int size();
|
||||
|
||||
public void close();
|
||||
|
||||
public int writeCacheSize();
|
||||
|
||||
public URIMetadata load(final WeakPriorityBlockingQueue.Element<WordReference> obrwi);
|
||||
|
||||
public URIMetadata load(final byte[] urlHash);
|
||||
|
||||
public void store(final URIMetadata entry) throws IOException;
|
||||
|
||||
public boolean remove(final byte[] urlHashBytes);
|
||||
|
||||
public boolean exists(final byte[] urlHash);
|
||||
|
||||
public CloneableIterator<byte[]> keys(final boolean up, final byte[] firstKey);
|
||||
|
||||
public CloneableIterator<URIMetadata> entries() throws IOException;
|
||||
|
||||
public CloneableIterator<URIMetadata> entries(final boolean up, final String firstHash) throws IOException;
|
||||
|
||||
/**
|
||||
* using a fragment of the url hash (5 bytes: bytes 6 to 10) it is possible to address all urls from a specific domain
|
||||
* here such a fragment can be used to delete all these domains at once
|
||||
* @param hosthash
|
||||
* @return number of deleted domains
|
||||
* @throws IOException
|
||||
*/
|
||||
public int deleteDomain(final String hosthash) throws IOException;
|
||||
}
|
@ -0,0 +1,34 @@
|
||||
/**
|
||||
* MetadataMigration
|
||||
* Copyright 2012 by Michael Peter Christen
|
||||
* First released 3.4.2012 at http://yacy.net
|
||||
*
|
||||
* This file is part of YaCy Content Integration
|
||||
*
|
||||
* This library is free software; you can redistribute it and/or
|
||||
* modify it under the terms of the GNU Lesser General Public
|
||||
* License as published by the Free Software Foundation; either
|
||||
* version 2.1 of the License, or (at your option) any later version.
|
||||
*
|
||||
* This library is distributed in the hope that it will be useful,
|
||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
* Lesser General Public License for more details.
|
||||
*
|
||||
* You should have received a copy of the GNU Lesser General Public License
|
||||
* along with this program in the file lgpl21.txt
|
||||
* If not, see <http://www.gnu.org/licenses/>.
|
||||
*/
|
||||
|
||||
package net.yacy.search.index;
|
||||
|
||||
/*
|
||||
* migration class that combines old MetadataRepository classes
|
||||
* with the new DocumentMetadata class
|
||||
*/
|
||||
public class MetadataMigration /*implements Metadata*/ {
|
||||
|
||||
private MetadataRepository metadataRepository;
|
||||
private DocumentMetadata documentMetadata;
|
||||
|
||||
}
|
Loading…
Reference in new issue