bug fixes and code cleaning

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@22 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 20 years ago
parent c13411c198
commit b9203bdb50

@ -307,7 +307,7 @@ public class IndexCreate_p {
prop.put("indexing-queue_list_"+i+"_modified", daydate(pcentry.lastModified));
prop.put("indexing-queue_list_"+i+"_href",((pcentry.scraper == null) ? "0" : ("" + pcentry.scraper.getAnchors().size())));
prop.put("indexing-queue_list_"+i+"_anchor", ((pcentry.scraper == null) ? "-" : pcentry.scraper.getHeadline()) );
prop.put("indexing-queue_list_"+i+"_url", pcentry.urlString);
prop.put("indexing-queue_list_"+i+"_url", pcentry.nomalizedURLString);
dark = !dark;
}
}

@ -456,7 +456,7 @@ public class dir {
public static void deletePhrase(plasmaSwitchboard switchboard, String urlstring, String phrase, String descr) {
try {
String urlhash = plasmaURL.urlHash(new URL(urlstring));
Set words = plasmaSwitchboard.getWords(("yacyshare " + phrase + " " + descr).getBytes());
Set words = plasmaCondenser.getWords(("yacyshare " + phrase + " " + descr).getBytes());
switchboard.removeReferences(urlhash, words);
switchboard.loadedURL.remove(urlhash);
} catch (Exception e) {

@ -122,7 +122,10 @@ public class transferRWI {
Iterator it = unknownURL.iterator();
while (it.hasNext()) unknownURLs += "," + (String) it.next();
if (unknownURLs.length() > 0) unknownURLs = unknownURLs.substring(1);
switchboard.log.logInfo("Received " + received + " Words [" + wordhashes[0] + " .. " + wordhashes[wordhashes.length - 1] + "] from peer " + iam + ", requested " + unknownURL.size() + " URL's");
if (wordhashes.length == 0)
switchboard.log.logInfo("Received 0 Words from peer " + iam + ", requested " + unknownURL.size() + " URL's");
else
switchboard.log.logInfo("Received " + received + " Words [" + wordhashes[0] + " .. " + wordhashes[wordhashes.length - 1] + "] from peer " + iam + ", requested " + unknownURL.size() + " URL's");
result = "ok";
} else {
result = "error_not_granted";

@ -583,6 +583,17 @@ public class plasmaCondenser {
}
}
/**
 * Runs the given bytes through a plasmaCondenser and returns the words it reports.
 *
 * @param text raw bytes to condense; may be null
 * @return the result of the condenser's getWords(), or null when text is null
 *         or the condenser raises an IOException
 */
public static Set getWords(byte[] text) {
    // nothing to condense
    if (text == null) {
        return null;
    }
    try {
        return new plasmaCondenser(new ByteArrayInputStream(text)).getWords();
    } catch (IOException e) {
        // failure is reported to the caller as null rather than propagated
        return null;
    }
}
public static void main(String[] args) {
if ((args.length == 0) || (args.length > 3)) System.out.println("wrong number of arguments: plasmaCondenser -text|-html <infile> <outfile>"); else try {

@ -238,7 +238,7 @@ public class plasmaHTCache {
if ((entry.status == CACHE_FILL) ||
(entry.status == CACHE_STALE_RELOAD_GOOD) ||
(entry.status == CACHE_STALE_RELOAD_BAD)) {
responseHeaderDB.set(entry.urlHash, entry.responseHeader);
responseHeaderDB.set(entry.nomalizedURLHash, entry.responseHeader);
}
// work off unwritten files and undone parsing
@ -254,7 +254,7 @@ public class plasmaHTCache {
}
entry.cacheFile.getParentFile().mkdirs();
serverFileUtils.write(entry.cacheArray, entry.cacheFile);
entry.cacheArray = null;
//entry.cacheArray = null;
} catch (FileNotFoundException e) {
// this is the case of a "(Not a directory)" error, which should be prohibited
// by the shallStoreCache() property. However, sometimes the error still occurs
@ -444,8 +444,8 @@ public class plasmaHTCache {
public File cacheFile; // the cache file
public byte[] cacheArray; // or the cache as byte-array
public URL url;
public String urlHash;
public String urlString;
public String nomalizedURLHash;
public String nomalizedURLString;
public int status; // cache load/hit/stale etc status
public Date lastModified;
public char doctype;
@ -462,15 +462,15 @@ public class plasmaHTCache {
plasmaCrawlProfile.entry profile) {
// normalize url
this.urlString = htmlFilterContentScraper.urlNormalform(url);
this.nomalizedURLString = htmlFilterContentScraper.urlNormalform(url);
try {
this.url = new URL(urlString);
this.url = new URL(nomalizedURLString);
} catch (MalformedURLException e) {
System.out.println("internal error at httpdProxyCache.Entry: " + e);
System.exit(-1);
}
this.cacheFile = getCachePath(this.url);
this.urlHash = plasmaCrawlLURL.urlHash(urlString);
this.nomalizedURLHash = plasmaCrawlLURL.urlHash(nomalizedURLString);
// assigned:
this.initDate = initDate;
@ -496,7 +496,7 @@ public class plasmaHTCache {
lastModified = responseHeader.lastModified();
if (lastModified == null) lastModified = new Date(); // does not exist in header
}
this.doctype = plasmaWordIndexEntry.docType(urlString);
this.doctype = plasmaWordIndexEntry.docType(nomalizedURLString);
this.language = plasmaWordIndexEntry.language(url);
// to be defined later:
@ -554,8 +554,8 @@ public class plasmaHTCache {
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if ((isPOST(urlString)) && (!(profile.crawlingQ()))) return "dynamic_post";
if (isCGI(urlString)) return "dynamic_cgi";
if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "dynamic_post";
if (isCGI(nomalizedURLString)) return "dynamic_cgi";
// -authorization cases in request
// authorization makes pages very individual, and therefore we cannot use the
@ -622,8 +622,8 @@ public class plasmaHTCache {
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if (isPOST(urlString)) return false;
if (isCGI(urlString)) return false;
if (isPOST(nomalizedURLString)) return false;
if (isCGI(nomalizedURLString)) return false;
// -authorization cases in request
if (requestHeader.containsKey("AUTHORIZATION")) return false;
@ -747,8 +747,8 @@ public class plasmaHTCache {
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if ((isPOST(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)";
if ((isCGI(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)";
if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)";
if ((isCGI(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)";
// -authorization cases in request
// we checked that in shallStoreCache
@ -759,7 +759,7 @@ public class plasmaHTCache {
// a picture cannot be indexed
if (isPicture(responseHeader)) return "Media_Content_(Picture)";
if (!(isText(responseHeader))) return "Media_Content_(not_text)";
if (noIndexingURL(urlString)) return "Media_Content_(forbidden)";
if (noIndexingURL(nomalizedURLString)) return "Media_Content_(forbidden)";
// -if-modified-since in request
@ -864,8 +864,8 @@ public class plasmaHTCache {
// -CGI access in request
// CGI access makes the page very individual, and therefore not usable in caches
if ((isPOST(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)";
if ((isCGI(urlString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)";
if ((isPOST(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(POST)";
if ((isCGI(nomalizedURLString)) && (!(profile.crawlingQ()))) return "Dynamic_(CGI)";
// -authorization cases in request
// we checked that in shallStoreCache
@ -876,7 +876,7 @@ public class plasmaHTCache {
// a picture cannot be indexed
if (isPicture(responseHeader)) return "Media_Content_(Picture)";
if (!(isText(responseHeader))) return "Media_Content_(not_text)";
if (noIndexingURL(urlString)) return "Media_Content_(forbidden)";
if (noIndexingURL(nomalizedURLString)) return "Media_Content_(forbidden)";
// -if-modified-since in request
// if the page is fresh at the very moment we can index it

@ -393,7 +393,6 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
public synchronized void deQueue() {
if (serverJobs < 5) {
if (processStack.size() > 0) {
log.logDebug("DEQUEUE: dequeueing one step (processStack=" + processStack.size() + ", localStackSize=" + noticeURL.localStackSize() + ", remoteStackSize=" + noticeURL.remoteStackSize() + ")");
processResourceStack((plasmaHTCache.Entry) processStack.removeFirst());
}
} else {
@ -469,7 +468,13 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
private synchronized void processResourceStack(plasmaHTCache.Entry entry) {
// work off one stack entry with a fresh resource (scraped web page)
if ((entry.cacheArray != null) || (entry.scraper != null)) try {
String stats = "DEQUEUE: dequeueing one step (processStack=" + processStack.size() + ", localStackSize=" + noticeURL.localStackSize() + ", remoteStackSize=" + noticeURL.remoteStackSize() + ")";
if ((entry.cacheArray == null) && (entry.scraper == null)) {
log.logDebug(stats + " entry for " + entry.nomalizedURLString + " has no content -- skipped");
return;
}
try {
// we must distinguish the following cases: resource-load was initiated by
// 1) global crawling: the index is extern, not here (not possible here)
// 2) result of search queries, some indexes are here (not possible here)
@ -492,15 +497,15 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
processCase = 6;
}
log.logDebug("processResourceStack: processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", url=" + entry.url); // DEBUG
log.logDebug(stats + " processCase=" + processCase + ", depth=" + entry.depth + ", maxDepth=" + entry.profile.generalDepth() + ", filter=" + entry.profile.generalFilter() + ", initiatorHash=" + initiatorHash + ", status=" + entry.status + ", source=" + ((entry.cacheArray == null) ? "scraper" : "byte[]") + ", url=" + entry.nomalizedURLString); // DEBUG
// parse content
plasmaParser.document document;
if (entry.scraper != null) {
log.logDebug("(Parser) '" + entry.urlString + "' is pre-parsed by scraper");
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is pre-parsed by scraper");
document = parser.transformScraper(entry.url, entry.responseHeader.mime(), entry.scraper);
} else {
log.logDebug("(Parser) '" + entry.urlString + "' is not parsed, parsing now");
log.logDebug("(Parser) '" + entry.nomalizedURLString + "' is not parsed, parsing now");
document = parser.parseSource(entry.url, entry.responseHeader.mime(), entry.cacheArray);
}
@ -516,11 +521,11 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
while (i.hasNext()) {
e = (Map.Entry) i.next();
nexturlstring = (String) e.getKey();
rejectReason = stackCrawl(nexturlstring, entry.urlString, initiatorHash, (String) e.getValue(), entry.lastModified, entry.depth + 1, entry.profile);
rejectReason = stackCrawl(nexturlstring, entry.nomalizedURLString, initiatorHash, (String) e.getValue(), entry.lastModified, entry.depth + 1, entry.profile);
if (rejectReason == null) {
c++;
} else {
errorURL.newEntry(new URL(nexturlstring), entry.urlString, entry.initiator(), yacyCore.seedDB.mySeed.hash,
errorURL.newEntry(new URL(nexturlstring), entry.nomalizedURLString, entry.initiator(), yacyCore.seedDB.mySeed.hash,
(String) e.getValue(), rejectReason, new bitfield(plasmaURL.urlFlagLength), false);
}
}
@ -543,12 +548,12 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
}
if (noIndexReason == null) {
// strip out words
log.logDebug("(Profile) Condensing for '" + entry.urlString + "'");
log.logDebug("(Profile) Condensing for '" + entry.nomalizedURLString + "'");
plasmaCondenser condenser = new plasmaCondenser(new ByteArrayInputStream(document.getText()));
//log.logInfo("INDEXING HEADLINE:" + descr);
try {
log.logDebug("(Profile) Create LURL-Entry for '" + entry.urlString + "'");
log.logDebug("(Profile) Create LURL-Entry for '" + entry.nomalizedURLString + "'");
plasmaCrawlLURL.entry newEntry = loadedURL.newEntry(
entry.url, descr, entry.lastModified, new Date(),
initiatorHash,
@ -563,28 +568,28 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
);
String urlHash = newEntry.hash();
log.logDebug("(Profile) Remove NURL for '" + entry.urlString + "'");
log.logDebug("(Profile) Remove NURL for '" + entry.nomalizedURLString + "'");
noticeURL.remove(urlHash); // worked-off
if (((processCase == 4) || (processCase == 5) || (processCase == 6)) &&
(entry.profile.localIndexing())) {
// remove stopwords
log.logDebug("(Profile) Exclude Stopwords for '" + entry.urlString + "'");
log.logDebug("(Profile) Exclude Stopwords for '" + entry.nomalizedURLString + "'");
log.logInfo("Excluded " + condenser.excludeWords(stopwords) + " words in URL " + entry.url);
//System.out.println("DEBUG: words left to be indexed: " + condenser.getWords());
// do indexing
log.logDebug("(Profile) Create Index for '" + entry.urlString + "'");
log.logDebug("(Profile) Create Index for '" + entry.nomalizedURLString + "'");
int words = searchManager.addPageIndex(entry.url, urlHash, entry.lastModified, condenser, entry.language, entry.doctype);
log.logInfo("Indexed " + words + " words in URL " + entry.url + " (" + descr + ")");
// if this was performed for a remote crawl request, notify requester
if ((processCase == 6) && (initiator != null)) {
log.logInfo("Sending crawl receipt for '" + entry.urlString + "' to " + initiator.getName());
log.logInfo("Sending crawl receipt for '" + entry.nomalizedURLString + "' to " + initiator.getName());
yacyClient.crawlReceipt(initiator, "crawl", "fill", "indexed", newEntry, "");
}
} else {
log.logDebug("Resource '" + entry.urlString + "' not indexed (indexing is off)");
log.logDebug("Resource '" + entry.nomalizedURLString + "' not indexed (indexing is off)");
}
} catch (Exception ee) {
log.logError("Could not index URL " + entry.url + ": " + ee.getMessage());
@ -831,11 +836,12 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
}
// Shared formatter using the pattern "EEE, dd MMM yyyy".
// SimpleDateFormat keeps mutable state while formatting and is not thread-safe,
// so dateString() holds this object's monitor for the duration of format().
private static SimpleDateFormat DateFormatter = new SimpleDateFormat("EEE, dd MMM yyyy");

/**
 * Formats a date with the shared "EEE, dd MMM yyyy" formatter.
 *
 * @param date the date to format; may be null
 * @return the formatted date, or the empty string when date is null
 */
public static String dateString(Date date) {
    if (date == null) {
        return "";
    }
    // serialize access: concurrent format() calls on one SimpleDateFormat can corrupt output
    synchronized (DateFormatter) {
        return DateFormatter.format(date);
    }
}
public serverObjects searchFromLocal(Set querywords, String order1, String order2, int count, boolean global, long time /*milliseconds*/, String urlmask) {
@ -911,7 +917,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
filename = url.getFile();
if ((seed == null) || ((address = seed.getAddress()) == null)) {
// seed is not known from here
removeReferences(urlentry.hash(), getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes()));
removeReferences(urlentry.hash(), plasmaCondenser.getWords(("yacyshare " + filename.replace('?', ' ') + " " + urlentry.descr()).getBytes()));
loadedURL.remove(urlentry.hash()); // clean up
continue; // next result
}
@ -1062,28 +1068,6 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
// actually it is used there for testing purpose
return "PROPS: " + super.toString() + "; QUEUE: " + processStack.toString();
}
/*
private void addScoreForked(kelondroMScoreCluster ref, String no, String[] words) {
String s;
if (words != null) for (int i = 0; i < words.length; i++) {
s = words[i].trim().toLowerCase();
if (s.indexOf(".") >= 0) addScoreForked(ref, no, s.split("\\."));
else if (s.indexOf(",") >= 0) addScoreForked(ref, no, s.split(","));
else if (s.indexOf(":") >= 0) addScoreForked(ref, no, s.split(":"));
else if (s.indexOf("-") >= 0) addScoreForked(ref, no, s.split("-"));
else if (s.indexOf("/") >= 0) addScoreForked(ref, no, s.split("/"));
else if (s.indexOf('"') >= 0) addScoreForked(ref, no, s.split(new String(new byte[] {(char)'"'})));
else addScoreFiltered(ref, no, s);
}
}
private void addScoreFiltered(kelondroMScoreCluster ref, String no, String word) {
if ((word.length() > 2) &&
("http_html_ftp_www_com_org_net_gov_edu_index_home_page_for_usage_the_and_".indexOf(word) < 0) &&
(no.indexOf(word) < 0))
ref.incScore(word);
}
*/
// method for index deletion
public int removeAllUrlReferences(URL url, boolean fetchOnline) {
@ -1099,7 +1083,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
URL url = entry.url();
if (url == null) return 0;
// get set of words
Set words = getWords(getText(getResource(url, fetchOnline)));
Set words = plasmaCondenser.getWords(getText(getResource(url, fetchOnline)));
// delete all word references
int count = removeReferences(urlhash, words);
// finally delete the url entry itself
@ -1172,17 +1156,6 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
}
}
/**
 * Condenses the given bytes and hands back the condenser's word set.
 *
 * @param text raw bytes to condense; may be null
 * @return the condenser's getWords() result; null if text is null or an
 *         IOException occurs while condensing
 */
public static Set getWords(byte[] text) {
    Set words = null;
    if (text != null) {
        try {
            plasmaCondenser wordSource = new plasmaCondenser(new ByteArrayInputStream(text));
            words = wordSource.getWords();
        } catch (IOException e) {
            // an unreadable input yields the same answer as a missing one
            words = null;
        }
    }
    return words;
}
public class distributeIndex {
// distributes parts of the index to other peers
// stops as soon as an error occurs
@ -1214,6 +1187,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
((transferred = performTransferIndex(indexCount, peerCount, true)) > 0)) {
indexCount = transferred;
if ((System.currentTimeMillis() - starttime) > (maxTime * peerCount)) indexCount--; else indexCount++;
if (indexCount < 30) indexCount = 30;
return true;
} else {
// make a long pause
@ -1230,6 +1204,7 @@ public class plasmaSwitchboard extends serverAbstractSwitch implements serverSwi
/**
 * Replaces the stored indexCount, peerCount and pause values.
 *
 * @param indexCount new index count; values below 30 are raised to 30
 * @param peerCount  new peer count, stored unchanged
 * @param pause      new pause value, stored unchanged
 */
public void setCounts(int indexCount, int peerCount, long pause) {
    // Clamp BEFORE storing: clamping the parameter after the assignment (as the
    // previous code did) left the field free to drop below the minimum of 30.
    this.indexCount = (indexCount < 30) ? 30 : indexCount;
    this.peerCount = peerCount;
    this.pause = pause;
}

@ -63,7 +63,6 @@ public class yacyDHTAction implements yacyPeerAction {
}
}
public Enumeration getDHTSeeds(boolean up, String firstHash) {
// enumerates seed-type objects: all seeds with starting point in the middle, rotating at the end/beginning
return new seedDHTEnum(up, firstHash);
@ -104,10 +103,8 @@ public class yacyDHTAction implements yacyPeerAction {
return e2.nextElement();
}
}
}
public Enumeration getAcceptRemoteIndexSeeds(String starthash) {
// returns an enumeration of yacySeed-Objects
// that have the AcceptRemoteIndex-Flag set
@ -131,10 +128,16 @@ public class yacyDHTAction implements yacyPeerAction {
private yacySeed nextInternal() {
yacySeed s;
while (se.hasMoreElements()) {
s = (yacySeed) se.nextElement();
if (s == null) return null;
if (s.getFlagAcceptRemoteIndex()) return s;
try {
while (se.hasMoreElements()) {
s = (yacySeed) se.nextElement();
if (s == null) return null;
if (s.getFlagAcceptRemoteIndex()) return s;
}
} catch (kelondroException e) {
yacyCore.log.logError("database inconsistency (" + e.getMessage() + "), re-set of db.");
seedDB.resetActiveTable();
return null;
}
return null;
}
@ -147,7 +150,6 @@ public class yacyDHTAction implements yacyPeerAction {
}
/**
 * Creates a new acceptRemoteCrawlSeedEnum for the given arguments.
 *
 * @param starthash passed through to the enumeration's constructor
 * @param available passed through to the enumeration's constructor
 * @return the newly created enumeration
 */
public Enumeration getAcceptRemoteCrawlSeeds(String starthash, boolean available) {
    Enumeration crawlSeeds = new acceptRemoteCrawlSeedEnum(starthash, available);
    return crawlSeeds;
}

@ -142,7 +142,7 @@ public class yacySeedDB {
private synchronized kelondroMap resetSeedTable(kelondroMap seedDB, File seedDBFile) {
// this is an emergency function that should only be used if any problem with the
// seed.db is detected
yacyCore.log.logError("seed-db " + seedDBFile.toString() + " reset (on-the-fly)");
yacyCore.log.logDebug("seed-db " + seedDBFile.toString() + " reset (on-the-fly)");
try {
seedDB.close();
seedDBFile.delete();
@ -154,6 +154,10 @@ public class yacySeedDB {
return seedDB;
}
/**
 * Passes the active seed table and its file to resetSeedTable and keeps the
 * table that call returns as the new active table.
 */
public synchronized void resetActiveTable() {
    kelondroMap replacement = resetSeedTable(seedActiveDB, seedActiveDBFile);
    seedActiveDB = replacement;
}
/**
 * Passes the passive seed table and its file to resetSeedTable and keeps the
 * table that call returns as the new passive table.
 */
public synchronized void resetPassiveTable() {
    kelondroMap replacement = resetSeedTable(seedPassiveDB, seedPassiveDBFile);
    seedPassiveDB = replacement;
}
/**
 * Passes the potential seed table and its file to resetSeedTable and keeps the
 * table that call returns as the new potential table.
 */
public synchronized void resetPotentialTable() {
    kelondroMap replacement = resetSeedTable(seedPotentialDB, seedPotentialDBFile);
    seedPotentialDB = replacement;
}
public void close() {
try {
seedActiveDB.close();

@ -1 +0,0 @@
testblue
Loading…
Cancel
Save