added a web structure computation and logging:

- all web page parsing operations now update a web structure file
- the file is computed in memory and dumped at shutdown-time to PLASMASB/webStructure.map in readable form (not a database)
- the file can be used externally to analyse the link structure of the crawled pages
- the web structure can also be retrieved via an XML interface at http://localhost:8080/xml/webstructure.xml
- the short-term purpose is the computation of a link-graph image (before linuxtag!)
- a long-term purpose could be a decentralized computation of the citation rank



git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@3746 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 18 years ago
parent cd3494a04c
commit 33ad0c8246

@ -3,7 +3,7 @@ javacSource=1.4
javacTarget=1.4
# Release Configuration
releaseVersion=0.521
releaseVersion=0.522
releaseFile=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
#releaseFile=yacy_v${releaseVersion}_${DSTAMP}_${releaseNr}.tar.gz
releaseDir=yacy_dev_v${releaseVersion}_${DSTAMP}_${releaseNr}

@ -0,0 +1,51 @@
package xml;
import java.util.Iterator;
import java.util.Map;
import de.anomic.http.httpHeader;
import de.anomic.plasma.plasmaSwitchboard;
import de.anomic.plasma.plasmaWebStructure;
import de.anomic.server.serverObjects;
import de.anomic.server.serverSwitch;
/**
 * Servlet backend for xml/webstructure.xml: renders the in-memory web
 * structure (domains and their outbound citations) as template properties.
 */
public class webstructure {

    public static serverObjects respond(httpHeader header, serverObjects post, serverSwitch env) {
        serverObjects prop = new serverObjects();
        plasmaSwitchboard switchboard = (plasmaSwitchboard) env;
        plasmaWebStructure web = switchboard.webStructure;

        int domCount = 0;
        for (Iterator domIt = web.structureEntryIterator(); domIt.hasNext(); domCount++) {
            plasmaWebStructure.structureEntry domEntry = (plasmaWebStructure.structureEntry) domIt.next();
            prop.put("domains_" + domCount + "_hash", domEntry.domhash);
            prop.put("domains_" + domCount + "_domain", domEntry.domain);
            prop.put("domains_" + domCount + "_date", domEntry.date);

            // enumerate the citations (referenced domains) of this domain
            int citeCount = 0;
            Iterator refIt = domEntry.references.entrySet().iterator();
            while (refIt.hasNext()) {
                Map.Entry ref = (Map.Entry) refIt.next();
                String refHash = (String) ref.getKey();
                String refDomain = web.resolveDomHash2DomString(refHash);
                // skip references whose domain is not (yet) known to the structure
                if (refDomain == null) continue;
                prop.put("domains_" + domCount + "_citations_" + citeCount + "_refhash", refHash);
                prop.put("domains_" + domCount + "_citations_" + citeCount + "_refdom", refDomain);
                Integer refCount = (Integer) ref.getValue();
                prop.put("domains_" + domCount + "_citations_" + citeCount + "_refcount", refCount.intValue());
                citeCount++;
            }
            prop.put("domains_" + domCount + "_citations", citeCount);
        }
        prop.put("domains", domCount);

        // return rewrite properties
        return prop;
    }
}

@ -0,0 +1,12 @@
<?xml version="1.0"?>
<webstructure>
<domains reference="forward" count="#[domains]#">
#{domains}#
<domain host="#[domain]#" id="#[hash]#" date="#[date]#">
#{citations}#
<citation host="#[refdom]#" id="#[refhash]#" count="#[refcount]#" />
#{/citations}#
</domain>
#{/domains}#
</domains>
</webstructure>

@ -66,7 +66,6 @@ import java.util.StringTokenizer;
import de.anomic.data.htmlTools;
import de.anomic.data.userDB;
import de.anomic.data.wikiCode;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSwitchboard;

@ -139,7 +139,6 @@ import de.anomic.index.indexContainer;
import de.anomic.index.indexRWIEntry;
import de.anomic.plasma.plasmaURL;
import de.anomic.index.indexURLEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroCache;
import de.anomic.kelondro.kelondroException;
@ -153,7 +152,6 @@ import de.anomic.plasma.parser.ParserException;
import de.anomic.plasma.urlPattern.defaultURLPattern;
import de.anomic.plasma.urlPattern.plasmaURLPattern;
import de.anomic.server.serverAbstractSwitch;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.serverInstantThread;
import de.anomic.server.serverObjects;
@ -175,9 +173,6 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public static int indexingSlots = 30;
public static int stackCrawlSlots = 1000000;
public static int maxCRLDump = 500000;
public static int maxCRGDump = 200000;
private int dhtTransferIndexCount = 50;
// we must distinguish the following cases: resource-load was initiated by
@ -241,8 +236,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
public HashMap indexingTasksInProcess;
public userDB userDB;
public bookmarksDB bookmarksDB;
//public StringBuffer crl; // local citation references
public StringBuffer crg; // global citation references
public plasmaWebStructure webStructure;
public dbImportManager dbImportManager;
public plasmaDHTFlush transferIdxThread = null;
private plasmaDHTChunk dhtTransferChunk = null;
@ -910,8 +904,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
// setting timestamp of last proxy access
this.proxyLastAccess = System.currentTimeMillis() - 60000;
crg = new StringBuffer(maxCRGDump);
//crl = new StringBuffer(maxCRLDump);
this.webStructure = new plasmaWebStructure(log, rankingPath, "LOCAL/010_cr/", getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN), new File(plasmaPath, "webStructure.map"));
// configuring list path
if (!(listsPath.exists())) listsPath.mkdirs();
@ -1678,7 +1671,8 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
parser.close();
cacheManager.close();
sbQueue.close();
flushCitationReference(crg, "crg");
webStructure.flushCitationReference("crg");
webStructure.close();
log.logConfig("SWITCHBOARD SHUTDOWN STEP 3: sending termination signal to database manager (stand by...)");
noticeURL.close();
delegatedURL.close();
@ -2298,7 +2292,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
plasmaCondenser condenser = new plasmaCondenser(document, entry.profile().indexText(), entry.profile().indexMedia());
// generate citation reference
Integer[] ioLinks = generateCitationReference(entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
Integer[] ioLinks = webStructure.generateCitationReference(entry.url(), entry.urlHash(), docDate, document, condenser); // [outlinksSame, outlinksOther]
try {
// check for interruption
@ -2564,84 +2558,7 @@ public final class plasmaSwitchboard extends serverAbstractSwitch implements ser
if (document != null) try { document.close(); } catch (Exception e) { /* ignore this */ }
}
}
// Builds one citation-reference ticket line for a parsed document and
// appends it to the global CR buffer (crg), flushing the buffer to disk
// when it exceeds maxCRGDump. Returns {LCount, GCount}: the number of
// outlinks to the same domain and to other domains, respectively.
private Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) {
// generate citation reference
Map hl = document.getHyperlinks();
Iterator it = hl.entrySet().iterator();
String nexturlhash;
StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1);
StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1);
String lhp = baseurlhash.substring(6); // local hash part
int GCount = 0;
int LCount = 0;
while (it.hasNext()) {
nexturlhash = plasmaURL.urlHash((String) ((Map.Entry) it.next()).getKey());
if (nexturlhash != null) {
// hashes that share the last 6 characters (the domain part) point to the same domain
if (nexturlhash.substring(6).equals(lhp)) {
cpl.append(nexturlhash.substring(0, 6)); // local link: store only the first 6 chars
LCount++;
} else {
cpg.append(nexturlhash); // global link: store the complete 12-char hash
GCount++;
}
}
}
// append this reference to buffer
// generate header info
String head = baseurlhash + "=" +
plasmaWordIndex.microDateHoursStr(docDate.getTime()) + // latest update timestamp of the URL
plasmaWordIndex.microDateHoursStr(System.currentTimeMillis()) + // last visit timestamp of the URL
kelondroBase64Order.enhancedCoder.encodeLongSmart(LCount, 2) + // count of links to local resources
kelondroBase64Order.enhancedCoder.encodeLongSmart(GCount, 2) + // count of links to global resources
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2) + // count of links to other documents
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) + // length of plain text in bytes
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.words().size(), 3) + // count of all unique words
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
//crl.append(head); crl.append ('|'); crl.append(cpl); crl.append((char) 13); crl.append((char) 10);
crg.append(head); crg.append('|'); crg.append(cpg); crg.append((char) 13); crg.append((char) 10);
// if buffer is full, flush it.
/*
if (crl.length() > maxCRLDump) {
flushCitationReference(crl, "crl");
crl = new StringBuffer(maxCRLDump);
}
**/
if (crg.length() > maxCRGDump) {
flushCitationReference(crg, "crg");
crg = new StringBuffer(maxCRGDump);
}
return new Integer[] {new Integer(LCount), new Integer(GCount)};
}
// Writes the collected citation references of the given type ("crl" or
// "crg") as a gzipped dump file below rankingPath, prefixed with a
// header that documents the ticket line format. IO errors are logged to
// stderr and otherwise ignored (best-effort dump).
private void flushCitationReference(StringBuffer cr, String type) {
if (cr.length() < 12) return; // a referee hash is 12 chars; less means nothing was collected
String filename = type.toUpperCase() + "-A-" + new serverDate().toShortString(true) + "." + cr.substring(0, 12) + ".cr.gz";
File path = new File(rankingPath, (type.equals("crl") ? "LOCAL/010_cr/" : getConfig("CRDist0Path", plasmaRankingDistribution.CR_OWN)));
path.mkdirs();
File file = new File(path, filename);
// generate header
StringBuffer header = new StringBuffer(200);
header.append("# Name=YaCy " + ((type.equals("crl")) ? "Local" : "Global") + " Citation Reference Ticket"); header.append((char) 13); header.append((char) 10);
header.append("# Created=" + System.currentTimeMillis()); header.append((char) 13); header.append((char) 10);
header.append("# Structure=<Referee-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,'|',*<Anchor-" + ((type.equals("crl")) ? "6" : "12") + ">"); header.append((char) 13); header.append((char) 10);
header.append("# ---"); header.append((char) 13); header.append((char) 10);
cr.insert(0, header.toString());
try {
serverFileUtils.writeAndGZip(cr.toString().getBytes(), file);
log.logFine("wrote citation reference dump " + file.toString());
} catch (IOException e) {
e.printStackTrace();
}
}
private void processLocalCrawling(plasmaCrawlEntry urlEntry, plasmaCrawlProfile.entry profile, String stats) {
// work off one Crawl stack entry
if ((urlEntry == null) || (urlEntry.url() == null)) {

@ -552,18 +552,6 @@ public class plasmaURL {
return domDomain(urlhash) != 7;
}
// Legacy URL hash: base64 encoding of the MD5 of the URL's normal form,
// truncated to the common seed-hash length. Returns null for a null URL.
public static final String oldurlHash(URL url) {
if (url == null) return null;
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(url.toNormalform())).substring(0, yacySeedDB.commonHashLength);
return hash;
}
// String overload of the legacy URL hash: parses the string into a URL
// first (may throw MalformedURLException). Returns null for null input
// or strings too short to be a URL (< 10 chars).
public static final String oldurlHash(String url) throws MalformedURLException {
if ((url == null) || (url.length() < 10)) return null;
String hash = kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(new URL(url).toNormalform())).substring(0, yacySeedDB.commonHashLength);
return hash;
}
public static final serverByteBuffer compressIndex(indexContainer inputContainer, indexContainer excludeContainer, long maxtime) {
// collect references according to domains
long timeout = (maxtime < 0) ? Long.MAX_VALUE : System.currentTimeMillis() + maxtime;

@ -0,0 +1,307 @@
// plasmaWebStructure.java
// -----------------------------
// (C) 2007 by Michael Peter Christen; mc@anomic.de, Frankfurt a. M., Germany
// first published 15.05.2007 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate: 2006-04-02 22:40:07 +0200 (So, 02 Apr 2006) $
// $LastChangedRevision: 1986 $
// $LastChangedBy: orbiter $
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package de.anomic.plasma;
import java.io.File;
import java.io.IOException;
import java.util.Date;
import java.util.Iterator;
import java.util.Map;
import java.util.SortedMap;
import java.util.TreeMap;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.net.URL;
import de.anomic.server.serverDate;
import de.anomic.server.serverFileUtils;
import de.anomic.server.logging.serverLog;
// In-memory model of the inter-domain link structure of crawled pages.
// Every parsed document feeds generateCitationReference, which (a) appends
// a citation-reference (CR) ticket to the global CR buffer and (b) updates
// the per-domain link counters via learn(). The structure is loaded from,
// and saved to, a readable map file (webStructure.map) at shutdown.
public class plasmaWebStructure {

// flush thresholds (in characters) for the citation reference buffers
public static int maxCRLDump = 500000;
public static int maxCRGDump = 200000;

private StringBuffer crg; // global citation references
private serverLog log;
private File rankingPath, structureFile;
private String crlFile, crgFile;
private TreeMap structure; // String2String with <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*

// Prepares an empty CR buffer and restores a previously dumped web
// structure from structureFile, if one exists.
public plasmaWebStructure(serverLog log, File rankingPath, String crlFile, String crgFile, File structureFile) {
this.log = log;
this.rankingPath = rankingPath;
this.crlFile = crlFile;
this.crgFile = crgFile;
this.crg = new StringBuffer(maxCRGDump);
this.structure = new TreeMap();
this.structureFile = structureFile;
// load a previously saved structure; null if the file does not exist or cannot be read
Map loadedStructure = serverFileUtils.loadHashMap(this.structureFile);
if (loadedStructure != null) this.structure.putAll(loadedStructure);
}

// Builds one citation-reference ticket for a parsed document, appends it
// to the global CR buffer (flushed to disk when it exceeds maxCRGDump)
// and updates the in-memory web structure with the document's outbound
// links. Returns {LCount, GCount}: the number of outlinks to the same
// domain and to other domains, respectively.
public Integer[] /*(outlinksSame, outlinksOther)*/ generateCitationReference(URL url, String baseurlhash, Date docDate, plasmaParserDocument document, plasmaCondenser condenser) {
assert plasmaURL.urlHash(url).equals(baseurlhash);
// generate citation reference
Map hl = document.getHyperlinks();
Iterator it = hl.entrySet().iterator();
String nexturlhash;
StringBuffer cpg = new StringBuffer(12 * (hl.size() + 1) + 1);
StringBuffer cpl = new StringBuffer(12 * (hl.size() + 1) + 1);
String lhp = baseurlhash.substring(6); // local hash part
int GCount = 0;
int LCount = 0;
while (it.hasNext()) {
nexturlhash = plasmaURL.urlHash((String) ((Map.Entry) it.next()).getKey());
if (nexturlhash != null) {
// hashes sharing the last 6 characters (the domain part) point to the same domain
if (nexturlhash.substring(6).equals(lhp)) {
// this is a inbound link
cpl.append(nexturlhash.substring(0, 6)); // store only local part
LCount++;
} else {
// this is a outbound link
cpg.append(nexturlhash); // store complete hash
GCount++;
}
}
}
// append this reference to buffer
// generate header info
String head = baseurlhash + "=" +
plasmaWordIndex.microDateHoursStr(docDate.getTime()) + // latest update timestamp of the URL
plasmaWordIndex.microDateHoursStr(System.currentTimeMillis()) + // last visit timestamp of the URL
kelondroBase64Order.enhancedCoder.encodeLongSmart(LCount, 2) + // count of links to local resources
kelondroBase64Order.enhancedCoder.encodeLongSmart(GCount, 2) + // count of links to global resources
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getImages().size(), 2) + // count of Images in document
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 2) + // count of links to other documents
kelondroBase64Order.enhancedCoder.encodeLongSmart(document.getTextLength(), 3) + // length of plain text in bytes
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.RESULT_NUMB_WORDS, 3) + // count of all appearing words
kelondroBase64Order.enhancedCoder.encodeLongSmart(condenser.words().size(), 3) + // count of all unique words
kelondroBase64Order.enhancedCoder.encodeLongSmart(0, 1); // Flags (update, popularity, attention, vote)
//crl.append(head); crl.append ('|'); crl.append(cpl); crl.append((char) 13); crl.append((char) 10);
crg.append(head); crg.append('|'); crg.append(cpg); crg.append((char) 13); crg.append((char) 10);
// update the in-memory domain link structure with the outbound hashes
learn(url, cpg);
// if buffer is full, flush it.
/*
if (crl.length() > maxCRLDump) {
flushCitationReference(crl, "crl");
crl = new StringBuffer(maxCRLDump);
}
**/
if (crg.length() > maxCRGDump) {
flushCitationReference("crg");
crg = new StringBuffer(maxCRGDump);
}
return new Integer[] {new Integer(LCount), new Integer(GCount)};
}

// Writes the collected citation references as a gzipped dump file below
// rankingPath, prefixed with a header that documents the ticket format.
// NOTE(review): this always flushes the crg buffer regardless of the
// given type; type only selects the file name and target path — confirm
// this is intended (the crl buffer was removed in this revision).
public void flushCitationReference(String type) {
if (crg.length() < 12) return; // a referee hash is 12 chars; less means nothing was collected
String filename = type.toUpperCase() + "-A-" + new serverDate().toShortString(true) + "." + crg.substring(0, 12) + ".cr.gz";
File path = new File(rankingPath, (type.equals("crl")) ? crlFile : crgFile);
path.mkdirs();
File file = new File(path, filename);
// generate header
StringBuffer header = new StringBuffer(200);
header.append("# Name=YaCy " + ((type.equals("crl")) ? "Local" : "Global") + " Citation Reference Ticket"); header.append((char) 13); header.append((char) 10);
header.append("# Created=" + System.currentTimeMillis()); header.append((char) 13); header.append((char) 10);
header.append("# Structure=<Referee-12>,'=',<UDate-3>,<VDate-3>,<LCount-2>,<GCount-2>,<ICount-2>,<DCount-2>,<TLength-3>,<WACount-3>,<WUCount-3>,<Flags-1>,'|',*<Anchor-" + ((type.equals("crl")) ? "6" : "12") + ">"); header.append((char) 13); header.append((char) 10);
header.append("# ---"); header.append((char) 13); header.append((char) 10);
crg.insert(0, header.toString());
try {
serverFileUtils.writeAndGZip(crg.toString().getBytes(), file);
log.logFine("wrote citation reference dump " + file.toString());
} catch (IOException e) {
e.printStackTrace();
}
}

// Parses a structure value string <date-yyyymmdd(8)>{<target-b64hash(6)><count-hex(4)>}*
// into a TreeMap from 6-char target domain hash (String) to citation count (Integer).
// Returns an empty map for null input or a bare date with no records.
public static TreeMap refstr2map(String refs) {
if ((refs == null) || (refs.length() <= 8)) return new TreeMap();
TreeMap map = new TreeMap();
String c;
assert (refs.length() - 8) % 10 == 0; // after the 8-char date, records are exactly 10 chars each
int refsc = (refs.length() - 8) / 10;
for (int i = 0; i < refsc; i++) {
c = refs.substring(8 + i * 10, 8 + (i + 1) * 10);
map.put(c.substring(0, 6), new Integer(Integer.parseInt(c.substring(6), 16)));
}
return map;
}

// Inverse of refstr2map: serializes a domhash->count map, prefixed with
// today's date. Counts are written as exactly 4 hex digits, capped at FFFF.
private static String map2refstr(TreeMap map) {
StringBuffer s = new StringBuffer(map.size() * 10);
s.append(plasmaURL.shortDayFormatter.format(new Date()));
Iterator i = map.entrySet().iterator();
Map.Entry entry;
String h;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
s.append((String) entry.getKey());
h = Integer.toHexString(((Integer) entry.getValue()).intValue());
// left-pad the hex count to exactly 4 digits
if (h.length() == 0) {
s.append("0000");
} else if (h.length() == 1) {
s.append("000").append(h);
} else if (h.length() == 2) {
s.append("00").append(h);
} else if (h.length() == 3) {
s.append('0').append(h);
} else if (h.length() == 4) {
s.append(h);
} else {
// count does not fit into 4 hex digits: cap at the maximum
s.append("FFFF");
}
}
return s.toString();
}

// Returns the citation map (target domhash -> count) of the given 6-char
// domain hash, or an empty map if the domain is unknown. Lookup uses
// tailMap + prefix match because keys are <domhash>','<host>.
public TreeMap references(String domhash) {
assert domhash.length() == 6;
SortedMap tailMap = structure.tailMap(domhash);
if ((tailMap == null) || (tailMap.size() == 0)) return new TreeMap();
String key = (String) tailMap.firstKey();
if (key.startsWith(domhash)) {
return refstr2map((String) tailMap.get(key));
} else {
return new TreeMap();
}
}

public String resolveDomHash2DomString(String domhash) {
// returns the domain as string, null if unknown
assert domhash.length() == 6;
SortedMap tailMap = structure.tailMap(domhash);
if ((tailMap == null) || (tailMap.size() == 0)) return null;
String key = (String) tailMap.firstKey();
if (key.startsWith(domhash)) {
// strip "<domhash>," (7 chars) to get the host part
return key.substring(7);
} else {
return null;
}
}

// Registers the outbound links of one document: for each target domain
// in the reference string, increments the citation count of the
// document's own domain, then re-serializes the entry into 'structure'.
private void learn(URL url, StringBuffer reference /*string of b64(12digits)-hashes*/) {
String domhash = plasmaURL.urlHash(url).substring(6); // domain part (last 6 chars) of the url hash
TreeMap refs = references(domhash);
assert reference.length() % 12 == 0;
String dom;
int c;
for (int i = 0; i < reference.length() / 12; i++) {
dom = reference.substring(i * 12 + 6, (i + 1) * 12); // domain part of the target hash
c = 0;
if (refs.containsKey(dom)) {
c = ((Integer) refs.get(dom)).intValue();
}
refs.put(dom, new Integer(++c));
}
structure.put(domhash + "," + url.getHost(), map2refstr(refs));
}

// Dumps the in-memory structure to structureFile in readable map form;
// IO errors are printed and otherwise ignored (best-effort persistence).
public void saveWebStructure() {
try {
serverFileUtils.saveMap(this.structureFile, this.structure, "Web Structure Syntax: <b64hash(6)>','<host> to <date-yyyymmdd(8)>{<target-b64hash(6)><target-count-hex(4)>}*");
} catch (IOException e) {
e.printStackTrace();
}
}

public Iterator structureEntryIterator() {
// iterates objects of type structureEntry
return new structureIterator();
}

// Iterator over the structure map that yields decoded structureEntry
// objects, skipping malformed keys (a valid key <domhash(6)>','<host>
// is at least 8 characters long).
public class structureIterator implements Iterator {

private Iterator i;
private structureEntry nextentry;

public structureIterator() {
i = structure.entrySet().iterator();
next0();
}

public boolean hasNext() {
return nextentry != null;
}

// advances to the next well-formed entry; sets nextentry to null when exhausted
private void next0() {
Map.Entry entry = null;
String dom = null, ref;
while (i.hasNext()) {
entry = (Map.Entry) i.next();
dom = (String) entry.getKey();
if (dom.length() >= 8) break;
if (!i.hasNext()) {
nextentry = null;
return;
}
}
if ((entry == null) || (dom == null)) {
nextentry = null;
return;
}
ref = (String) entry.getValue();
// split key into domhash (6 chars) and host; value into date (8 chars) and reference map
nextentry = new structureEntry(dom.substring(0, 6), dom.substring(7), ref.substring(0, 8), refstr2map(ref));
}

public Object next() {
structureEntry r = nextentry;
next0();
return r;
}

public void remove() {
throw new UnsupportedOperationException("not implemented");
}
}

// Decoded view of one structure record: a domain, its hash, the date of
// the record and its citation map (target domhash -> count).
public class structureEntry {
public String domhash, domain, date;
public Map references;
public structureEntry(String domhash, String domain, String date, Map references) {
this.domhash = domhash;
this.domain = domain;
this.date = date;
this.references = references;
}
}

// Persists the web structure at shutdown; called from the switchboard.
public void close() {
log.logInfo("Saving Web Structure File");
saveWebStructure();
}
}

@ -45,7 +45,6 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.Properties;
import java.util.logging.Level;
import java.util.logging.LogManager;
import java.util.logging.Logger;

@ -49,7 +49,6 @@ import org.apache.axis.AxisFault;
import org.w3c.dom.Document;
import de.anomic.data.htmlTools;
import de.anomic.data.wikiCode;
import de.anomic.plasma.plasmaURL;
import de.anomic.net.URL;
import de.anomic.plasma.plasmaSearchPreOrder;

Loading…
Cancel
Save