git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@6608 6c8d7289-2bf4-0310-a012-ef5d649a1542pull/1/head
parent
c8aece34a4
commit
db19a941cf
@ -0,0 +1,64 @@
|
||||
// ImageReference.java
|
||||
// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 21.01.2010 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-04-03 15:23:45 +0200 (Fr, 03 Apr 2009) $
|
||||
// $LastChangedRevision: 5777 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.kelondro.data.image;
|
||||
|
||||
import net.yacy.kelondro.order.Bitfield;
|
||||
import net.yacy.kelondro.rwi.Reference;
|
||||
|
||||
public interface ImageReference extends Reference {
|
||||
|
||||
public int virtualAge();
|
||||
|
||||
public int hitcount();
|
||||
|
||||
public int posinphrase();
|
||||
|
||||
public int posofphrase();
|
||||
|
||||
public int wordsintext();
|
||||
|
||||
public int phrasesintext();
|
||||
|
||||
public String getLanguage();
|
||||
|
||||
public char getType();
|
||||
|
||||
public int wordsintitle();
|
||||
|
||||
public int llocal();
|
||||
|
||||
public int lother();
|
||||
|
||||
public int urllength();
|
||||
|
||||
public int urlcomps();
|
||||
|
||||
public Bitfield flags();
|
||||
|
||||
public double termFrequency();
|
||||
|
||||
}
|
@ -0,0 +1,48 @@
|
||||
// ImageReferenceFactory.java
|
||||
// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 21.01.2010 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-04-03 15:23:45 +0200 (Fr, 03 Apr 2009) $
|
||||
// $LastChangedRevision: 5777 $
|
||||
// $LastChangedBy: orbiter $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.kelondro.data.image;
|
||||
|
||||
import net.yacy.kelondro.index.Row;
|
||||
import net.yacy.kelondro.index.Row.Entry;
|
||||
import net.yacy.kelondro.rwi.ReferenceFactory;
|
||||
|
||||
public class ImageReferenceFactory implements ReferenceFactory<ImageReference> {
|
||||
|
||||
public ImageReference produceSlow(Entry e) {
|
||||
return null; //new ImageReferenceRow(e);
|
||||
}
|
||||
|
||||
public ImageReference produceFast(ImageReference r) {
|
||||
if (r instanceof ImageReferenceVars) return r;
|
||||
return new ImageReferenceVars(r);
|
||||
}
|
||||
|
||||
public Row getRow() {
|
||||
return ImageReferenceRow.urlEntryRow;
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,281 @@
|
||||
// ImageReferenceRow.java
|
||||
// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 21.01.2010 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
|
||||
// $LastChangedRevision: 5736 $
|
||||
// $LastChangedBy: borg-0300 $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.kelondro.data.image;
|
||||
|
||||
import net.yacy.kelondro.data.word.Word;
|
||||
import net.yacy.kelondro.index.Column;
|
||||
import net.yacy.kelondro.index.Row;
|
||||
import net.yacy.kelondro.index.Row.Entry;
|
||||
import net.yacy.kelondro.order.Base64Order;
|
||||
import net.yacy.kelondro.order.Bitfield;
|
||||
import net.yacy.kelondro.order.MicroDate;
|
||||
import net.yacy.kelondro.rwi.AbstractReference;
|
||||
import net.yacy.kelondro.rwi.Reference;
|
||||
|
||||
/**
|
||||
* this object stores attributes to URL references inside RWI collections
|
||||
*
|
||||
*/
|
||||
public final class ImageReferenceRow extends AbstractReference implements /*ImageReference,*/ Cloneable {
|
||||
|
||||
/**
|
||||
* object for termination of concurrent blocking queue processing
|
||||
*/
|
||||
public static final ImageReferenceRow poison = new ImageReferenceRow((Row.Entry) null);
|
||||
|
||||
|
||||
public static final Row urlEntryRow = new Row(new Column[]{
|
||||
new Column("h", Column.celltype_string, Column.encoder_bytes, Word.commonHashLength, "urlhash"),
|
||||
new Column("f", Column.celltype_cardinal, Column.encoder_b256, 4, "created"),
|
||||
new Column("m", Column.celltype_cardinal, Column.encoder_b256, 4, "modified"),
|
||||
new Column("s", Column.celltype_cardinal, Column.encoder_bytes, 4, "size-bytes"),
|
||||
new Column("d", Column.celltype_binary, Column.encoder_bytes, 1, "doctype"),
|
||||
new Column("q", Column.celltype_binary, Column.encoder_bytes, 1, "quality"),
|
||||
new Column("w", Column.celltype_cardinal, Column.encoder_b256, 2, "width"), // pixels
|
||||
new Column("i", Column.celltype_cardinal, Column.encoder_b256, 2, "height"), // pixels
|
||||
new Column("i", Column.celltype_cardinal, Column.encoder_b256, 2, "iso"), // iso number
|
||||
new Column("i", Column.celltype_cardinal, Column.encoder_b256, 2, "verschlusszeit"), // the x in 1/x
|
||||
new Column("i", Column.celltype_cardinal, Column.encoder_b256, 2, "blende"),
|
||||
new Column("i", Column.celltype_cardinal, Column.encoder_b256, 4, "distance"),
|
||||
new Column("o", Column.celltype_cardinal, Column.encoder_b256, 4, "author-id"), // author, creator, operator, camera-number
|
||||
new Column("o", Column.celltype_cardinal, Column.encoder_b256, 4, "group-id"), // may be also a crawl start identifier
|
||||
new Column("o", Column.celltype_cardinal, Column.encoder_b256, 4, "subgroupgroup-id"), // may be also a pages-in-crawl identifier
|
||||
new Column("o", Column.celltype_cardinal, Column.encoder_b256, 4, "counter-in-subgroup"), // may be also a counter of images on a page
|
||||
new Column("o", Column.celltype_cardinal, Column.encoder_b256, 4, "location-lon-x"),
|
||||
new Column("a", Column.celltype_cardinal, Column.encoder_b256, 4, "location-lat-y"),
|
||||
new Column("l", Column.celltype_cardinal, Column.encoder_b256, 4, "location-alt-h"),
|
||||
new Column("t", Column.celltype_string, Column.encoder_bytes, 4, "typeOfImage"), // a 4-stage taxonomy
|
||||
new Column("z", Column.celltype_bitfield, Column.encoder_bytes, 4, "flags"),
|
||||
new Column("r", Column.celltype_binary, Column.encoder_bytes, 3, "RGBAverage"),
|
||||
new Column("k", Column.celltype_cardinal, Column.encoder_b256, 1, "reserve")
|
||||
},
|
||||
Base64Order.enhancedCoder
|
||||
);
|
||||
// available chars: b,e,j,q
|
||||
|
||||
// static properties
|
||||
private static final int col_urlhash = 0; // h 12 the url hash b64-encoded
|
||||
private static final int col_lastModified = 1; // a 2 last-modified time of the document where word appears
|
||||
private static final int col_freshUntil = 2; // s 2 TTL for the word, so it can be removed easily if the TTL is short
|
||||
private static final int col_doctype = 6; // d 1 type of document
|
||||
private static final int col_urlLength = 10; // m 1 byte-length of complete URL
|
||||
private static final int col_urlComps = 11; // n 1 number of path components
|
||||
|
||||
// dynamic properties
|
||||
private static final int col_rgbaverage = 12; // g 6 an average of the RGB values
|
||||
private static final int col_typeofimage = 12; // g 4 classification
|
||||
private static final int col_flags = 13; // z 4 b64-encoded appearance flags (24 bit, see definition below)
|
||||
private static final int col_hitcount = 14; // c 1 number of occurrences of this word in text
|
||||
private static final int col_posintext = 15; // t 2 first appearance of word in text
|
||||
private static final int col_posinphrase = 16; // r 1 position of word in its phrase
|
||||
private static final int col_posofphrase = 17; // o 1 number of the phrase where word appears
|
||||
private static final int col_reserve1 = 18; // i 1 reserve1
|
||||
private static final int col_reserve2 = 19; // k 1 reserve2
|
||||
|
||||
// ideas for the classification bytes
|
||||
// 0 : content-type (person-portrait, persons-group, landscape, buildings, technical, artistical)
|
||||
// 1 : content-situation (a categorization of the type, like: person/standing, building/factory, artistical/cubistic)
|
||||
// 2 : content-category (a classification that is taken from the text environment by text analysis)
|
||||
// 3 :
|
||||
|
||||
private final Row.Entry entry;
|
||||
|
||||
public ImageReferenceRow(final String urlHash,
|
||||
final int urlLength, // byte-length of complete URL
|
||||
final int urlComps, // number of path components
|
||||
final int titleLength, // length of description/length (longer are better?)
|
||||
final int hitcount, // how often appears this word in the text
|
||||
final int wordcount, // total number of words
|
||||
final int phrasecount, // total number of phrases
|
||||
final int posintext, // position of word in all words
|
||||
final int posinphrase, // position of word in its phrase
|
||||
final int posofphrase, // number of the phrase where word appears
|
||||
final long lastmodified, // last-modified time of the document where word appears
|
||||
final long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
|
||||
final String language, // (guessed) language of document
|
||||
final char doctype, // type of document
|
||||
final int outlinksSame, // outlinks to same domain
|
||||
final int outlinksOther, // outlinks to other domain
|
||||
final Bitfield flags // attributes to the url and to the word according the url
|
||||
) {
|
||||
|
||||
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
|
||||
this.entry = urlEntryRow.newEntry();
|
||||
final int mddlm = MicroDate.microDateDays(lastmodified);
|
||||
final int mddct = MicroDate.microDateDays(updatetime);
|
||||
this.entry.setCol(col_urlhash, urlHash, null);
|
||||
this.entry.setCol(col_lastModified, mddlm);
|
||||
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
|
||||
this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
|
||||
this.entry.setCol(col_urlLength, urlLength);
|
||||
this.entry.setCol(col_urlComps, urlComps);
|
||||
this.entry.setCol(col_flags, flags.bytes());
|
||||
this.entry.setCol(col_hitcount, hitcount);
|
||||
this.entry.setCol(col_posintext, posintext);
|
||||
this.entry.setCol(col_posinphrase, posinphrase);
|
||||
this.entry.setCol(col_posofphrase, posofphrase);
|
||||
this.entry.setCol(col_reserve1, 0);
|
||||
this.entry.setCol(col_reserve2, 0);
|
||||
}
|
||||
|
||||
public ImageReferenceRow(final String urlHash,
|
||||
final int urlLength, // byte-length of complete URL
|
||||
final int urlComps, // number of path components
|
||||
final int titleLength, // length of description/length (longer are better?)
|
||||
final int wordcount, // total number of words
|
||||
final int phrasecount, // total number of phrases
|
||||
final long lastmodified, // last-modified time of the document where word appears
|
||||
final long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
|
||||
final String language, // (guessed) language of document
|
||||
final char doctype, // type of document
|
||||
final int outlinksSame, // outlinks to same domain
|
||||
final int outlinksOther // outlinks to other domain
|
||||
) {
|
||||
assert (urlHash.length() == 12) : "urlhash = " + urlHash;
|
||||
this.entry = urlEntryRow.newEntry();
|
||||
final int mddlm = MicroDate.microDateDays(lastmodified);
|
||||
final int mddct = MicroDate.microDateDays(updatetime);
|
||||
this.entry.setCol(col_urlhash, urlHash, null);
|
||||
this.entry.setCol(col_lastModified, mddlm);
|
||||
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
|
||||
this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
|
||||
this.entry.setCol(col_urlLength, urlLength);
|
||||
this.entry.setCol(col_urlComps, urlComps);
|
||||
this.entry.setCol(col_reserve1, 0);
|
||||
this.entry.setCol(col_reserve2, 0);
|
||||
}
|
||||
|
||||
public ImageReferenceRow(final String urlHash, final String code) {
|
||||
// the code is the external form of the row minus the leading urlHash entry
|
||||
this.entry = urlEntryRow.newEntry((urlHash + code).getBytes());
|
||||
}
|
||||
|
||||
public ImageReferenceRow(final String external) {
|
||||
this.entry = urlEntryRow.newEntry(external, true);
|
||||
}
|
||||
|
||||
public ImageReferenceRow(final byte[] row) {
|
||||
this.entry = urlEntryRow.newEntry(row);
|
||||
}
|
||||
|
||||
public ImageReferenceRow(final byte[] row, final int offset, final boolean clone) {
|
||||
this.entry = urlEntryRow.newEntry(row, offset, clone);
|
||||
}
|
||||
|
||||
public ImageReferenceRow(final Row.Entry rentry) {
|
||||
// FIXME: see if cloning is necessary
|
||||
this.entry = rentry;
|
||||
}
|
||||
|
||||
public ImageReferenceRow clone() {
|
||||
final byte[] b = new byte[urlEntryRow.objectsize];
|
||||
System.arraycopy(entry.bytes(), 0, b, 0, urlEntryRow.objectsize);
|
||||
return new ImageReferenceRow(b);
|
||||
}
|
||||
|
||||
public String toPropertyForm() {
|
||||
return entry.toPropertyForm(true, true, false);
|
||||
}
|
||||
|
||||
public Entry toKelondroEntry() {
|
||||
return this.entry;
|
||||
}
|
||||
|
||||
public String metadataHash() {
|
||||
return this.entry.getColString(col_urlhash, null);
|
||||
}
|
||||
|
||||
public int virtualAge() {
|
||||
return (int) this.entry.getColLong(col_lastModified); // this is the time in MicoDateDays format
|
||||
}
|
||||
|
||||
public long lastModified() {
|
||||
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_lastModified));
|
||||
}
|
||||
|
||||
public long freshUntil() {
|
||||
return MicroDate.reverseMicroDateDays((int) this.entry.getColLong(col_freshUntil));
|
||||
}
|
||||
|
||||
public int hitcount() {
|
||||
return (int) this.entry.getColLong(col_hitcount);
|
||||
}
|
||||
|
||||
public int positions() {
|
||||
return 1;
|
||||
}
|
||||
|
||||
public int position(int p) {
|
||||
assert p == 0 : "p = " + p;
|
||||
return (int) this.entry.getColLong(col_posintext);
|
||||
}
|
||||
|
||||
public int posinphrase() {
|
||||
return (int) this.entry.getColLong(col_posinphrase);
|
||||
}
|
||||
|
||||
public int posofphrase() {
|
||||
return (int) this.entry.getColLong(col_posofphrase);
|
||||
}
|
||||
|
||||
|
||||
public char getType() {
|
||||
return (char) this.entry.getColByte(col_doctype);
|
||||
}
|
||||
|
||||
public int urllength() {
|
||||
return (int) this.entry.getColLong(col_urlLength);
|
||||
}
|
||||
|
||||
public int urlcomps() {
|
||||
return (int) this.entry.getColLong(col_urlComps);
|
||||
}
|
||||
|
||||
public Bitfield flags() {
|
||||
return new Bitfield(this.entry.getColBytes(col_flags));
|
||||
}
|
||||
|
||||
public String toString() {
|
||||
return toPropertyForm();
|
||||
}
|
||||
|
||||
public boolean isOlder(final Reference other) {
|
||||
if (other == null) return false;
|
||||
if (this.lastModified() < other.lastModified()) return true;
|
||||
return false;
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.metadataHash().hashCode();
|
||||
}
|
||||
|
||||
public void join(Reference oe) {
|
||||
throw new UnsupportedOperationException("");
|
||||
|
||||
}
|
||||
|
||||
}
|
@ -0,0 +1,365 @@
|
||||
// ImageReferenceVars.java
|
||||
// (C) 2010 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
|
||||
// first published 21.01.2010 on http://yacy.net
|
||||
//
|
||||
// This is a part of YaCy, a peer-to-peer based web search engine
|
||||
//
|
||||
// $LastChangedDate: 2009-03-20 16:44:59 +0100 (Fr, 20 Mrz 2009) $
|
||||
// $LastChangedRevision: 5736 $
|
||||
// $LastChangedBy: borg-0300 $
|
||||
//
|
||||
// LICENSE
|
||||
//
|
||||
// This program is free software; you can redistribute it and/or modify
|
||||
// it under the terms of the GNU General Public License as published by
|
||||
// the Free Software Foundation; either version 2 of the License, or
|
||||
// (at your option) any later version.
|
||||
//
|
||||
// This program is distributed in the hope that it will be useful,
|
||||
// but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
||||
// GNU General Public License for more details.
|
||||
//
|
||||
// You should have received a copy of the GNU General Public License
|
||||
// along with this program; if not, write to the Free Software
|
||||
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
|
||||
package net.yacy.kelondro.data.image;
|
||||
|
||||
import java.util.ArrayList;
|
||||
|
||||
import net.yacy.kelondro.index.Row.Entry;
|
||||
import net.yacy.kelondro.order.Bitfield;
|
||||
import net.yacy.kelondro.order.MicroDate;
|
||||
import net.yacy.kelondro.rwi.AbstractReference;
|
||||
import net.yacy.kelondro.rwi.Reference;
|
||||
|
||||
|
||||
public class ImageReferenceVars extends AbstractReference implements ImageReference, Reference, Cloneable {
|
||||
|
||||
/**
|
||||
* object for termination of concurrent blocking queue processing
|
||||
*/
|
||||
public static final ImageReferenceVars poison = new ImageReferenceVars();
|
||||
|
||||
|
||||
public Bitfield flags;
|
||||
public long lastModified;
|
||||
public String language, urlHash;
|
||||
public char type;
|
||||
public int hitcount, llocal, lother, phrasesintext,
|
||||
posinphrase, posofphrase,
|
||||
urlcomps, urllength, virtualAge,
|
||||
wordsintext, wordsintitle;
|
||||
ArrayList<Integer> positions;
|
||||
public double termFrequency;
|
||||
|
||||
public ImageReferenceVars(
|
||||
final String urlHash,
|
||||
final int urlLength, // byte-length of complete URL
|
||||
final int urlComps, // number of path components
|
||||
final int titleLength, // length of description/length (longer are better?)
|
||||
final int hitcount, // how often appears this word in the text
|
||||
final int wordcount, // total number of words
|
||||
final int phrasecount, // total number of phrases
|
||||
final ArrayList<Integer> ps, // positions of words that are joined into the reference
|
||||
final int posinphrase, // position of word in its phrase
|
||||
final int posofphrase, // number of the phrase where word appears
|
||||
final long lastmodified, // last-modified time of the document where word appears
|
||||
final long updatetime, // update time; this is needed to compute a TTL for the word, so it can be removed easily if the TTL is short
|
||||
String language, // (guessed) language of document
|
||||
final char doctype, // type of document
|
||||
final int outlinksSame, // outlinks to same domain
|
||||
final int outlinksOther, // outlinks to other domain
|
||||
final Bitfield flags, // attributes to the url and to the word according the url
|
||||
final double termfrequency
|
||||
) {
|
||||
if ((language == null) || (language.length() != 2)) language = "uk";
|
||||
final int mddlm = MicroDate.microDateDays(lastmodified);
|
||||
//final int mddct = MicroDate.microDateDays(updatetime);
|
||||
this.flags = flags;
|
||||
//this.freshUntil = Math.max(0, mddlm + (mddct - mddlm) * 2);
|
||||
this.lastModified = lastmodified;
|
||||
this.language = language;
|
||||
this.urlHash = urlHash;
|
||||
this.type = doctype;
|
||||
this.hitcount = hitcount;
|
||||
this.llocal = outlinksSame;
|
||||
this.lother = outlinksOther;
|
||||
this.phrasesintext = phrasecount;
|
||||
this.positions = new ArrayList<Integer>(ps.size());
|
||||
for (int i = 0; i < ps.size(); i++) this.positions.add(ps.get(i));
|
||||
this.posinphrase = posinphrase;
|
||||
this.posofphrase = posofphrase;
|
||||
this.urlcomps = urlComps;
|
||||
this.urllength = urlLength;
|
||||
this.virtualAge = mddlm;
|
||||
this.wordsintext = wordcount;
|
||||
this.wordsintitle = titleLength;
|
||||
this.termFrequency = termfrequency;
|
||||
}
|
||||
|
||||
public ImageReferenceVars(final ImageReference e) {
|
||||
this.flags = e.flags();
|
||||
//this.freshUntil = e.freshUntil();
|
||||
this.lastModified = e.lastModified();
|
||||
this.language = e.getLanguage();
|
||||
this.urlHash = e.metadataHash();
|
||||
this.type = e.getType();
|
||||
this.hitcount = e.hitcount();
|
||||
this.llocal = e.llocal();
|
||||
this.lother = e.lother();
|
||||
this.phrasesintext = e.phrasesintext();
|
||||
this.positions = new ArrayList<Integer>(e.positions());
|
||||
for (int i = 0; i < e.positions(); i++) this.positions.add(e.position(i));
|
||||
this.posinphrase = e.posinphrase();
|
||||
this.posofphrase = e.posofphrase();
|
||||
this.urlcomps = e.urlcomps();
|
||||
this.urllength = e.urllength();
|
||||
this.virtualAge = e.virtualAge();
|
||||
this.wordsintext = e.wordsintext();
|
||||
this.wordsintitle = e.wordsintitle();
|
||||
this.termFrequency = e.termFrequency();
|
||||
}
|
||||
|
||||
/**
|
||||
* initializer for special poison object
|
||||
*/
|
||||
public ImageReferenceVars() {
|
||||
this.flags = null;
|
||||
this.lastModified = 0;
|
||||
this.language = null;
|
||||
this.urlHash = null;
|
||||
this.type = ' ';
|
||||
this.hitcount = 0;
|
||||
this.llocal = 0;
|
||||
this.lother = 0;
|
||||
this.phrasesintext = 0;
|
||||
this.positions = null;
|
||||
this.posinphrase = 0;
|
||||
this.posofphrase = 0;
|
||||
this.urlcomps = 0;
|
||||
this.urllength = 0;
|
||||
this.virtualAge = 0;
|
||||
this.wordsintext = 0;
|
||||
this.wordsintitle = 0;
|
||||
this.termFrequency = 0.0;
|
||||
}
|
||||
|
||||
public ImageReferenceVars clone() {
|
||||
final ImageReferenceVars c = new ImageReferenceVars(
|
||||
this.urlHash,
|
||||
this.urllength,
|
||||
this.urlcomps,
|
||||
this.wordsintitle,
|
||||
this.hitcount,
|
||||
this.wordsintext,
|
||||
this.phrasesintext,
|
||||
this.positions,
|
||||
this.posinphrase,
|
||||
this.posofphrase,
|
||||
this.lastModified,
|
||||
System.currentTimeMillis(),
|
||||
this.language,
|
||||
this.type,
|
||||
this.llocal,
|
||||
this.lother,
|
||||
this.flags,
|
||||
this.termFrequency);
|
||||
return c;
|
||||
}
|
||||
|
||||
public void join(final ImageReferenceVars v) {
|
||||
// combine the distance
|
||||
this.positions.addAll(v.positions);
|
||||
this.posinphrase = (this.posofphrase == v.posofphrase) ? Math.min(this.posinphrase, v.posinphrase) : 0;
|
||||
this.posofphrase = Math.min(this.posofphrase, v.posofphrase);
|
||||
|
||||
// combine term frequency
|
||||
this.wordsintext = this.wordsintext + v.wordsintext;
|
||||
this.termFrequency = this.termFrequency + v.termFrequency;
|
||||
}
|
||||
|
||||
public Bitfield flags() {
|
||||
return flags;
|
||||
}
|
||||
/*
|
||||
public long freshUntil() {
|
||||
return freshUntil;
|
||||
}
|
||||
*/
|
||||
public String getLanguage() {
|
||||
return language;
|
||||
}
|
||||
|
||||
public char getType() {
|
||||
return type;
|
||||
}
|
||||
|
||||
public int hitcount() {
|
||||
return hitcount;
|
||||
}
|
||||
|
||||
public boolean isOlder(final Reference other) {
|
||||
assert false; // should not be used
|
||||
return false;
|
||||
}
|
||||
|
||||
public long lastModified() {
|
||||
return lastModified;
|
||||
}
|
||||
|
||||
public int llocal() {
|
||||
return llocal;
|
||||
}
|
||||
|
||||
public int lother() {
|
||||
return lother;
|
||||
}
|
||||
|
||||
public int phrasesintext() {
|
||||
return phrasesintext;
|
||||
}
|
||||
|
||||
public int posinphrase() {
|
||||
return posinphrase;
|
||||
}
|
||||
|
||||
public int positions() {
|
||||
return this.positions.size();
|
||||
}
|
||||
|
||||
public int position(int p) {
|
||||
return this.positions.get(p);
|
||||
}
|
||||
|
||||
public int posofphrase() {
|
||||
return posofphrase;
|
||||
}
|
||||
|
||||
public ImageReferenceRow toRowEntry() {
|
||||
return new ImageReferenceRow(
|
||||
urlHash,
|
||||
urllength, // byte-length of complete URL
|
||||
urlcomps, // number of path components
|
||||
wordsintitle, // length of description/length (longer are better?)
|
||||
hitcount, // how often appears this word in the text
|
||||
wordsintext, // total number of words
|
||||
phrasesintext, // total number of phrases
|
||||
positions.get(0), // position of word in all words
|
||||
posinphrase, // position of word in its phrase
|
||||
posofphrase, // number of the phrase where word appears
|
||||
lastModified, // last-modified time of the document where word appears
|
||||
System.currentTimeMillis(), // update time;
|
||||
language, // (guessed) language of document
|
||||
type, // type of document
|
||||
llocal, // outlinks to same domain
|
||||
lother, // outlinks to other domain
|
||||
flags // attributes to the url and to the word according the url
|
||||
);
|
||||
}
|
||||
|
||||
public Entry toKelondroEntry() {
|
||||
return toRowEntry().toKelondroEntry();
|
||||
}
|
||||
|
||||
public String toPropertyForm() {
|
||||
return toRowEntry().toPropertyForm();
|
||||
}
|
||||
|
||||
public String metadataHash() {
|
||||
return urlHash;
|
||||
}
|
||||
|
||||
public int urlcomps() {
|
||||
return urlcomps;
|
||||
}
|
||||
|
||||
public int urllength() {
|
||||
return urllength;
|
||||
}
|
||||
|
||||
public int virtualAge() {
|
||||
return virtualAge;
|
||||
}
|
||||
|
||||
public int wordsintext() {
|
||||
return wordsintext;
|
||||
}
|
||||
|
||||
public int wordsintitle() {
|
||||
return wordsintitle;
|
||||
}
|
||||
|
||||
public double termFrequency() {
|
||||
if (this.termFrequency == 0.0) this.termFrequency = (((double) this.hitcount()) / ((double) (this.wordsintext() + this.wordsintitle() + 1)));
|
||||
return this.termFrequency;
|
||||
}
|
||||
|
||||
public final void min(final ImageReferenceVars other) {
|
||||
if (other == null) return;
|
||||
int v;
|
||||
long w;
|
||||
double d;
|
||||
if (this.hitcount > (v = other.hitcount)) this.hitcount = v;
|
||||
if (this.llocal > (v = other.llocal)) this.llocal = v;
|
||||
if (this.lother > (v = other.lother)) this.lother = v;
|
||||
if (this.virtualAge > (v = other.virtualAge)) this.virtualAge = v;
|
||||
if (this.wordsintext > (v = other.wordsintext)) this.wordsintext = v;
|
||||
if (this.phrasesintext > (v = other.phrasesintext)) this.phrasesintext = v;
|
||||
this.positions = a(Math.min(min(this.positions), min(other.positions)));
|
||||
if (this.posinphrase > (v = other.posinphrase)) this.posinphrase = v;
|
||||
if (this.posofphrase > (v = other.posofphrase)) this.posofphrase = v;
|
||||
if (this.lastModified > (w = other.lastModified)) this.lastModified = w;
|
||||
//if (this.freshUntil > (w = other.freshUntil)) this.freshUntil = w;
|
||||
if (this.urllength > (v = other.urllength)) this.urllength = v;
|
||||
if (this.urlcomps > (v = other.urlcomps)) this.urlcomps = v;
|
||||
if (this.wordsintitle > (v = other.wordsintitle)) this.wordsintitle = v;
|
||||
if (this.termFrequency > (d = other.termFrequency)) this.termFrequency = d;
|
||||
}
|
||||
|
||||
public final void max(final ImageReferenceVars other) {
|
||||
if (other == null) return;
|
||||
int v;
|
||||
long w;
|
||||
double d;
|
||||
if (this.hitcount < (v = other.hitcount)) this.hitcount = v;
|
||||
if (this.llocal < (v = other.llocal)) this.llocal = v;
|
||||
if (this.lother < (v = other.lother)) this.lother = v;
|
||||
if (this.virtualAge < (v = other.virtualAge)) this.virtualAge = v;
|
||||
if (this.wordsintext < (v = other.wordsintext)) this.wordsintext = v;
|
||||
if (this.phrasesintext < (v = other.phrasesintext)) this.phrasesintext = v;
|
||||
this.positions = a(Math.max(max(this.positions), max(other.positions)));
|
||||
if (this.posinphrase < (v = other.posinphrase)) this.posinphrase = v;
|
||||
if (this.posofphrase < (v = other.posofphrase)) this.posofphrase = v;
|
||||
if (this.lastModified < (w = other.lastModified)) this.lastModified = w;
|
||||
//if (this.freshUntil < (w = other.freshUntil)) this.freshUntil = w;
|
||||
if (this.urllength < (v = other.urllength)) this.urllength = v;
|
||||
if (this.urlcomps < (v = other.urlcomps)) this.urlcomps = v;
|
||||
if (this.wordsintitle < (v = other.wordsintitle)) this.wordsintitle = v;
|
||||
if (this.termFrequency < (d = other.termFrequency)) this.termFrequency = d;
|
||||
}
|
||||
|
||||
public void join(final Reference r) {
|
||||
// joins two entries into one entry
|
||||
|
||||
// combine the distance
|
||||
ImageReference oe = (ImageReference) r;
|
||||
for (int i = 0; i < r.positions(); i++) this.positions.add(r.position(i));
|
||||
this.posinphrase = (this.posofphrase == oe.posofphrase()) ? Math.min(this.posinphrase, oe.posinphrase()) : 0;
|
||||
this.posofphrase = Math.min(this.posofphrase, oe.posofphrase());
|
||||
|
||||
// combine term frequency
|
||||
this.termFrequency = this.termFrequency + oe.termFrequency();
|
||||
this.wordsintext = this.wordsintext + oe.wordsintext();
|
||||
}
|
||||
|
||||
public int hashCode() {
|
||||
return this.urlHash.hashCode();
|
||||
}
|
||||
|
||||
public void addPosition(int position) {
|
||||
this.positions.add(position);
|
||||
}
|
||||
|
||||
}
|
Loading…
Reference in new issue