replaces an expensive getLoadTimeURL() call with exists()

renames urlExists to getHarvestProcess, as that is what the method actually does
pull/405/head
sgaebel 4 years ago
parent a5488ac8f5
commit c69c462a15

@ -336,17 +336,13 @@ public class Load_RSS_p {
final List<DigestURL> urlsToIndex = new ArrayList<DigestURL>();
loop: for (final Map.Entry<String, DigestURL> entry: hash2UrlMap.entrySet()) {
try {
final DigestURL messageUrl = entry.getValue();
HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageUrl.hash()));
if (harvestProcess != null) {
continue loop;
}
urlsToIndex.add(messageUrl);
RSSLoader.indexTriggered.insertIfAbsent(messageUrl.hash(), new Date());
} catch (final IOException e) {
ConcurrentLog.logException(e);
final DigestURL messageUrl = entry.getValue();
HarvestProcess harvestProcess = sb.getHarvestProcess(ASCII.String(messageUrl.hash()));
if (harvestProcess != null) {
continue loop;
}
urlsToIndex.add(messageUrl);
RSSLoader.indexTriggered.insertIfAbsent(messageUrl.hash(), new Date());
}
sb.addToIndex(urlsToIndex, null, null, collections, true);
@ -413,57 +409,48 @@ public class Load_RSS_p {
}
pubDate = item.getPubDate();
HarvestProcess harvestProcess;
try {
if(link != null && StringUtils.isNotEmpty(item.getGuid())) {
harvestProcess = sb.urlExists(ASCII.String(link.hash()));
prop.put("showitems_item_" + i + "_hasLink", true);
prop.putHTML("showitems_item_" + i + "_hasLink_link", link.toNormalform(true));
final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(link.hash()) ? 1 : 0;
prop.put("showitems_item_" + i + "_state", state);
prop.put("showitems_item_" + i + "_indexable", state == 0);
prop.put("showitems_item_" + i + "_indexable_count", i);
prop.putHTML("showitems_item_" + i + "_indexable_inputValue", (link == enclosure ? CHECKBOX_MEDIA_ITEM_PREFIX : CHECKBOX_ITEM_PREFIX) + item.getGuid());
} else {
prop.put("showitems_item_" + i + "_state", 0);
prop.put("showitems_item_" + i + "_indexable", false);
prop.put("showitems_item_" + i + "_hasLink", false);
}
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
if(link != null && StringUtils.isNotEmpty(item.getGuid())) {
HarvestProcess harvestProcess = sb.getHarvestProcess(ASCII.String(link.hash()));
prop.put("showitems_item_" + i + "_hasLink", true);
prop.putHTML("showitems_item_" + i + "_hasLink_link", link.toNormalform(true));
final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(link.hash()) ? 1 : 0;
prop.put("showitems_item_" + i + "_state", state);
prop.put("showitems_item_" + i + "_indexable", state == 0);
prop.put("showitems_item_" + i + "_indexable_count", i);
prop.putHTML("showitems_item_" + i + "_indexable_inputValue", (link == enclosure ? CHECKBOX_MEDIA_ITEM_PREFIX : CHECKBOX_ITEM_PREFIX) + item.getGuid());
} else {
prop.put("showitems_item_" + i + "_state", 0);
prop.put("showitems_item_" + i + "_indexable", false);
prop.put("showitems_item_" + i + "_hasLink", false);
}
prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author);
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
prop.put("showitems_item_" + i + "_defaultMediaDesc", false);
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
i++;
if(enclosure != null && enclosure != link && StringUtils.isNotEmpty(item.getGuid())) {
HarvestProcess harvestProcess = sb.getHarvestProcess(ASCII.String(enclosure.hash()));
prop.put("showitems_item_" + i + "_hasLink", true);
prop.putHTML("showitems_item_" + i + "_hasLink_link", enclosure.toNormalform(true));
final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(enclosure.hash()) ? 1 : 0;
prop.put("showitems_item_" + i + "_state", state);
prop.put("showitems_item_" + i + "_indexable", state == 0);
prop.put("showitems_item_" + i + "_indexable_count", i);
prop.putHTML("showitems_item_" + i + "_indexable_inputValue", "media_" + item.getGuid());
prop.putHTML("showitems_item_" + i + "_author", "");
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString());
prop.put("showitems_item_" + i + "_defaultMediaDesc", false);
prop.putHTML("showitems_item_" + i + "_language", item.getLanguage());
prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate));
prop.putHTML("showitems_item_" + i + "_description", "");
/* Description is already used for the main item link, use here a default one */
prop.put("showitems_item_" + i + "_defaultMediaDesc", true);
prop.putHTML("showitems_item_" + i + "_language", "");
prop.putHTML("showitems_item_" + i + "_date", "");
i++;
} catch (IOException e) {
ConcurrentLog.logException(e);
}
try {
if(enclosure != null && enclosure != link && StringUtils.isNotEmpty(item.getGuid())) {
harvestProcess = sb.urlExists(ASCII.String(enclosure.hash()));
prop.put("showitems_item_" + i + "_hasLink", true);
prop.putHTML("showitems_item_" + i + "_hasLink_link", enclosure.toNormalform(true));
final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(enclosure.hash()) ? 1 : 0;
prop.put("showitems_item_" + i + "_state", state);
prop.put("showitems_item_" + i + "_indexable", state == 0);
prop.put("showitems_item_" + i + "_indexable_count", i);
prop.putHTML("showitems_item_" + i + "_indexable_inputValue", "media_" + item.getGuid());
prop.putHTML("showitems_item_" + i + "_author", "");
prop.putHTML("showitems_item_" + i + "_title", item.getTitle());
prop.putHTML("showitems_item_" + i + "_description", "");
/* Description is already used for the main item link, use here a default one */
prop.put("showitems_item_" + i + "_defaultMediaDesc", true);
prop.putHTML("showitems_item_" + i + "_language", "");
prop.putHTML("showitems_item_" + i + "_date", "");
i++;
}
} catch (IOException e) {
ConcurrentLog.logException(e);
}
}
}
prop.put("showitems_item", i);
prop.put("showitems_num", i);

@ -140,18 +140,13 @@ public class RSSLoader extends Thread {
final List<DigestURL> list = new ArrayList<DigestURL>();
for (final Map.Entry<String, DigestURL> e: urlmap.entrySet()) {
HarvestProcess harvestProcess;
try {
harvestProcess = sb.urlExists(e.getKey());
if (harvestProcess != null) {
continue;
}
list.add(e.getValue());
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
loadCount++;
} catch (IOException e1) {
ConcurrentLog.logException(e1);
HarvestProcess harvestProcess = sb.getHarvestProcess(e.getKey());
if (harvestProcess != null) {
continue;
}
list.add(e.getValue());
indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date());
loadCount++;
}
sb.addToIndex(list, null, null, collections, true);
// update info for loading

@ -25,7 +25,6 @@
package net.yacy.crawler.retrieval;
import java.io.IOException;
import java.net.MalformedURLException;
import java.util.Date;
@ -84,20 +83,15 @@ public class SitemapImporter extends Thread {
// check if the url is known and needs to be recrawled
Date lastMod = entry.lastmod(null);
if (lastMod != null) {
HarvestProcess dbocc;
try {
dbocc = this.sb.urlExists(ASCII.String(nexturlhash));
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
// the url was already loaded. we need to check the date
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
if (oldEntry != null) {
final Date modDate = oldEntry.moddate();
// check if modDate is null
if (modDate.after(lastMod)) return;
}
HarvestProcess dbocc = this.sb.getHarvestProcess(ASCII.String(nexturlhash));
if (dbocc != null && dbocc == HarvestProcess.LOADED) {
// the url was already loaded. we need to check the date
final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash);
if (oldEntry != null) {
final Date modDate = oldEntry.moddate();
// check if modDate is null
if (modDate.after(lastMod)) return;
}
} catch (IOException e) {
ConcurrentLog.logException(e);
}
}

@ -122,7 +122,6 @@ import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.federate.solr.FailCategory;
import net.yacy.cora.federate.solr.Ranking;
import net.yacy.cora.federate.solr.connector.ShardSelection;
import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL;
import net.yacy.cora.federate.solr.instance.EmbeddedInstance;
import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.yacy.CacheStrategy;
@ -1437,7 +1436,6 @@ public final class Switchboard extends serverSwitch {
i++;
}
} catch ( final NoSuchAlgorithmException e1 ) {
// TODO Auto-generated catch block
ConcurrentLog.logException(e1);
}
@ -1906,9 +1904,8 @@ public final class Switchboard extends serverSwitch {
* @param hash
* @return the HarvestProcess in which the URL was found (LOADED if it is already in the index), or null if it does not exist
*/
public HarvestProcess urlExists(final String hash) throws IOException {
LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash);
if (md != null && md.date >= 0) return HarvestProcess.LOADED;
public HarvestProcess getHarvestProcess(final String hash) {
if (this.index.fulltext().getDefaultConnector().exists(hash)) return HarvestProcess.LOADED;
HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash));
if (hp != null) return hp;
return null; // todo: can also be in error

Loading…
Cancel
Save