From c69c462a15da962694edb1636897b26d0f2c45a3 Mon Sep 17 00:00:00 2001 From: sgaebel Date: Sat, 20 Mar 2021 14:46:38 +0100 Subject: [PATCH] replaces a expensive getLoadTimeURL() by exists() refactors urlExists to getHarvestProcess as that is what it does --- htroot/Load_RSS_p.java | 105 ++++++++---------- .../net/yacy/crawler/retrieval/RSSLoader.java | 17 +-- .../crawler/retrieval/SitemapImporter.java | 22 ++-- source/net/yacy/search/Switchboard.java | 7 +- 4 files changed, 62 insertions(+), 89 deletions(-) diff --git a/htroot/Load_RSS_p.java b/htroot/Load_RSS_p.java index 7ce395de3..81d9b8667 100644 --- a/htroot/Load_RSS_p.java +++ b/htroot/Load_RSS_p.java @@ -336,17 +336,13 @@ public class Load_RSS_p { final List urlsToIndex = new ArrayList(); loop: for (final Map.Entry entry: hash2UrlMap.entrySet()) { - try { - final DigestURL messageUrl = entry.getValue(); - HarvestProcess harvestProcess = sb.urlExists(ASCII.String(messageUrl.hash())); - if (harvestProcess != null) { - continue loop; - } - urlsToIndex.add(messageUrl); - RSSLoader.indexTriggered.insertIfAbsent(messageUrl.hash(), new Date()); - } catch (final IOException e) { - ConcurrentLog.logException(e); + final DigestURL messageUrl = entry.getValue(); + HarvestProcess harvestProcess = sb.getHarvestProcess(ASCII.String(messageUrl.hash())); + if (harvestProcess != null) { + continue loop; } + urlsToIndex.add(messageUrl); + RSSLoader.indexTriggered.insertIfAbsent(messageUrl.hash(), new Date()); } sb.addToIndex(urlsToIndex, null, null, collections, true); @@ -413,57 +409,48 @@ public class Load_RSS_p { } pubDate = item.getPubDate(); - HarvestProcess harvestProcess; - try { - if(link != null && StringUtils.isNotEmpty(item.getGuid())) { - harvestProcess = sb.urlExists(ASCII.String(link.hash())); - - prop.put("showitems_item_" + i + "_hasLink", true); - prop.putHTML("showitems_item_" + i + "_hasLink_link", link.toNormalform(true)); - final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(link.hash()) ? 1 : 0; - prop.put("showitems_item_" + i + "_state", state); - prop.put("showitems_item_" + i + "_indexable", state == 0); - prop.put("showitems_item_" + i + "_indexable_count", i); - prop.putHTML("showitems_item_" + i + "_indexable_inputValue", (link == enclosure ? CHECKBOX_MEDIA_ITEM_PREFIX : CHECKBOX_ITEM_PREFIX) + item.getGuid()); - } else { - prop.put("showitems_item_" + i + "_state", 0); - prop.put("showitems_item_" + i + "_indexable", false); - prop.put("showitems_item_" + i + "_hasLink", false); - } - prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); + if(link != null && StringUtils.isNotEmpty(item.getGuid())) { + HarvestProcess harvestProcess = sb.getHarvestProcess(ASCII.String(link.hash())); + + prop.put("showitems_item_" + i + "_hasLink", true); + prop.putHTML("showitems_item_" + i + "_hasLink_link", link.toNormalform(true)); + final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(link.hash()) ? 1 : 0; + prop.put("showitems_item_" + i + "_state", state); + prop.put("showitems_item_" + i + "_indexable", state == 0); + prop.put("showitems_item_" + i + "_indexable_count", i); + prop.putHTML("showitems_item_" + i + "_indexable_inputValue", (link == enclosure ? CHECKBOX_MEDIA_ITEM_PREFIX : CHECKBOX_ITEM_PREFIX) + item.getGuid()); + } else { + prop.put("showitems_item_" + i + "_state", 0); + prop.put("showitems_item_" + i + "_indexable", false); + prop.put("showitems_item_" + i + "_hasLink", false); + } + prop.putHTML("showitems_item_" + i + "_author", author == null ? "" : author); + prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); + prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString()); + prop.put("showitems_item_" + i + "_defaultMediaDesc", false); + prop.putHTML("showitems_item_" + i + "_language", item.getLanguage()); + prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); + i++; + + if(enclosure != null && enclosure != link && StringUtils.isNotEmpty(item.getGuid())) { + HarvestProcess harvestProcess = sb.getHarvestProcess(ASCII.String(enclosure.hash())); + + prop.put("showitems_item_" + i + "_hasLink", true); + prop.putHTML("showitems_item_" + i + "_hasLink_link", enclosure.toNormalform(true)); + final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(enclosure.hash()) ? 1 : 0; + prop.put("showitems_item_" + i + "_state", state); + prop.put("showitems_item_" + i + "_indexable", state == 0); + prop.put("showitems_item_" + i + "_indexable_count", i); + prop.putHTML("showitems_item_" + i + "_indexable_inputValue", "media_" + item.getGuid()); + prop.putHTML("showitems_item_" + i + "_author", ""); prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); - prop.putHTML("showitems_item_" + i + "_description", item.getDescriptions().toString()); - prop.put("showitems_item_" + i + "_defaultMediaDesc", false); - prop.putHTML("showitems_item_" + i + "_language", item.getLanguage()); - prop.putHTML("showitems_item_" + i + "_date", (pubDate == null) ? "" : DateFormat.getDateTimeInstance().format(pubDate)); + prop.putHTML("showitems_item_" + i + "_description", ""); + /* Description is already used for the main item link, use here a default one */ + prop.put("showitems_item_" + i + "_defaultMediaDesc", true); + prop.putHTML("showitems_item_" + i + "_language", ""); + prop.putHTML("showitems_item_" + i + "_date", ""); i++; - } catch (IOException e) { - ConcurrentLog.logException(e); - } - - try { - if(enclosure != null && enclosure != link && StringUtils.isNotEmpty(item.getGuid())) { - harvestProcess = sb.urlExists(ASCII.String(enclosure.hash())); - - prop.put("showitems_item_" + i + "_hasLink", true); - prop.putHTML("showitems_item_" + i + "_hasLink_link", enclosure.toNormalform(true)); - final int state = harvestProcess != null ? 2 : RSSLoader.indexTriggered.containsKey(enclosure.hash()) ? 1 : 0; - prop.put("showitems_item_" + i + "_state", state); - prop.put("showitems_item_" + i + "_indexable", state == 0); - prop.put("showitems_item_" + i + "_indexable_count", i); - prop.putHTML("showitems_item_" + i + "_indexable_inputValue", "media_" + item.getGuid()); - prop.putHTML("showitems_item_" + i + "_author", ""); - prop.putHTML("showitems_item_" + i + "_title", item.getTitle()); - prop.putHTML("showitems_item_" + i + "_description", ""); - /* Description is already used for the main item link, use here a default one */ - prop.put("showitems_item_" + i + "_defaultMediaDesc", true); - prop.putHTML("showitems_item_" + i + "_language", ""); - prop.putHTML("showitems_item_" + i + "_date", ""); - i++; - } - } catch (IOException e) { - ConcurrentLog.logException(e); - } + } } prop.put("showitems_item", i); prop.put("showitems_num", i); diff --git a/source/net/yacy/crawler/retrieval/RSSLoader.java b/source/net/yacy/crawler/retrieval/RSSLoader.java index 37324fdcf..31c959bc4 100644 --- a/source/net/yacy/crawler/retrieval/RSSLoader.java +++ b/source/net/yacy/crawler/retrieval/RSSLoader.java @@ -140,18 +140,13 @@ public class RSSLoader extends Thread { final List list = new ArrayList(); for (final Map.Entry e: urlmap.entrySet()) { - HarvestProcess harvestProcess; - try { - harvestProcess = sb.urlExists(e.getKey()); - if (harvestProcess != null) { - continue; - } - list.add(e.getValue()); - indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date()); - loadCount++; - } catch (IOException e1) { - ConcurrentLog.logException(e1); + HarvestProcess harvestProcess = sb.getHarvestProcess(e.getKey()); + if (harvestProcess != null) { + continue; } + list.add(e.getValue()); + indexTriggered.insertIfAbsent(ASCII.getBytes(e.getKey()), new Date()); + loadCount++; } sb.addToIndex(list, null, null, collections, true); // update info for loading diff --git a/source/net/yacy/crawler/retrieval/SitemapImporter.java b/source/net/yacy/crawler/retrieval/SitemapImporter.java index cd2d20e2b..1bceedf13 100644 --- a/source/net/yacy/crawler/retrieval/SitemapImporter.java +++ b/source/net/yacy/crawler/retrieval/SitemapImporter.java @@ -25,7 +25,6 @@ package net.yacy.crawler.retrieval; -import java.io.IOException; import java.net.MalformedURLException; import java.util.Date; @@ -84,20 +83,15 @@ public class SitemapImporter extends Thread { // check if the url is known and needs to be recrawled Date lastMod = entry.lastmod(null); if (lastMod != null) { - HarvestProcess dbocc; - try { - dbocc = this.sb.urlExists(ASCII.String(nexturlhash)); - if (dbocc != null && dbocc == HarvestProcess.LOADED) { - // the url was already loaded. we need to check the date - final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash); - if (oldEntry != null) { - final Date modDate = oldEntry.moddate(); - // check if modDate is null - if (modDate.after(lastMod)) return; - } + HarvestProcess dbocc = this.sb.getHarvestProcess(ASCII.String(nexturlhash)); + if (dbocc != null && dbocc == HarvestProcess.LOADED) { + // the url was already loaded. we need to check the date + final URIMetadataNode oldEntry = this.sb.index.fulltext().getMetadata(nexturlhash); + if (oldEntry != null) { + final Date modDate = oldEntry.moddate(); + // check if modDate is null + if (modDate.after(lastMod)) return; } - } catch (IOException e) { - ConcurrentLog.logException(e); } } diff --git a/source/net/yacy/search/Switchboard.java b/source/net/yacy/search/Switchboard.java index 2e08b6ca9..ed9239c52 100644 --- a/source/net/yacy/search/Switchboard.java +++ b/source/net/yacy/search/Switchboard.java @@ -122,7 +122,6 @@ import net.yacy.cora.document.id.MultiProtocolURL; import net.yacy.cora.federate.solr.FailCategory; import net.yacy.cora.federate.solr.Ranking; import net.yacy.cora.federate.solr.connector.ShardSelection; -import net.yacy.cora.federate.solr.connector.SolrConnector.LoadTimeURL; import net.yacy.cora.federate.solr.instance.EmbeddedInstance; import net.yacy.cora.federate.solr.instance.RemoteInstance; import net.yacy.cora.federate.yacy.CacheStrategy; @@ -1437,7 +1436,6 @@ public final class Switchboard extends serverSwitch { i++; } } catch ( final NoSuchAlgorithmException e1 ) { - // TODO Auto-generated catch block ConcurrentLog.logException(e1); } @@ -1906,9 +1904,8 @@ public final class Switchboard extends serverSwitch { * @param hash * @return if it exists, the name of the database is returned, if it not exists, null is returned */ - public HarvestProcess urlExists(final String hash) throws IOException { - LoadTimeURL md = this.index.fulltext().getDefaultConnector().getLoadTimeURL(hash); - if (md != null && md.date >= 0) return HarvestProcess.LOADED; + public HarvestProcess getHarvestProcess(final String hash) { + if (this.index.fulltext().getDefaultConnector().exists(hash)) return HarvestProcess.LOADED; HarvestProcess hp = this.crawlQueues.exists(ASCII.getBytes(hash)); if (hp != null) return hp; return null; // todo: can also be in error