Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

pull/1/head
Michael Peter Christen 11 years ago
commit a2b66fe2eb

@ -77,7 +77,7 @@
<input type="text" name="newEntry" size="50" />
<input type="submit" name="addBlacklistEntry" value="Add URL pattern" />
</div>
<p>The right '*', after the '/', can be replaced by a <a href="http://docs.oracle.com/javase/1.5.0/docs/api/java/util/regex/Pattern.html" target="_blank">regex</a>.</p>
<p>The right '*', after the '/', can be replaced by a <a href="http://docs.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a>.</p>
<ul>
<li>domain.net/fullpath</li>
<li>domain.net/*</li>
@ -85,7 +85,7 @@
<li>*.sub.domain.net/*</li>
<li>sub.domain.*/*</li>
<li>domain.*/*</li>
<li>a complete <a href="http://docs.oracle.com/javase/1.5.0/docs/api/java/util/regex/Pattern.html" target="_blank">regex</a> (slow)</li>
<li>a complete <a href="http://docs.oracle.com/javase/6/docs/api/java/util/regex/Pattern.html" target="_blank">regular expression</a> (slow)</li>
</ul>
</form>

@ -58,6 +58,7 @@ import net.yacy.search.index.Fulltext;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.query.QueryParams;
import net.yacy.search.schema.CollectionConfiguration.FailDoc;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -182,9 +183,6 @@ public class HostBrowser {
// collect hosts from crawler
final Map<String, Integer[]> crawler = (admin) ? sb.crawlQueues.noticeURL.getDomainStackHosts(StackType.LOCAL, sb.robots) : new HashMap<String, Integer[]>();
for (Map.Entry<String, Integer[]> host: crawler.entrySet()) {
hostscore.inc(host.getKey(), host.getValue()[0]);
}
// collect the errorurls
Map<String, ReversibleScoreMap<String>> exclfacets = admin ? fulltext.getDefaultConnector().getFacets(CollectionSchema.failtype_s.getSolrFieldName() + ":" + FailType.excl.name(), maxcount, CollectionSchema.host_s.getSolrFieldName()) : null;
@ -466,7 +464,8 @@ public class HostBrowser {
FailType failType = errorDocs.get(entry.getKey());
if (failType == null) {
// maybe this is only in the errorURL
prop.putHTML("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS ? sb.crawlQueues.errorURL.get(ASCII.String(uri.hash())).getFailReason() : "unknown error");
FailDoc faildoc = sb.crawlQueues.errorURL.get(ASCII.String(uri.hash()));
prop.putHTML("files_list_" + c + "_type_stored_error", process == HarvestProcess.ERRORS && faildoc != null ? faildoc.getFailReason() : "unknown error");
} else {
String ids = ASCII.String(uri.hash());
InfoCacheEntry ice = infoCache.get(ids);
@ -570,12 +569,12 @@ public class HostBrowser {
// get all urls from the index and store them here
for (String id: internalIDs) {
if (id.equals(urlhash)) continue; // no self-references
DigestURL u = fulltext.getURL(ASCII.getBytes(id));
DigestURL u = fulltext.getURL(id);
if (u != null) references_internal_urls.add(u.toNormalform(true));
}
for (String id: externalIDs) {
if (id.equals(urlhash)) continue; // no self-references
DigestURL u = fulltext.getURL(ASCII.getBytes(id));
DigestURL u = fulltext.getURL(id);
if (u != null) references_external_urls.add(u.toNormalform(true));
}
} catch (final IOException e) {

@ -374,7 +374,7 @@ public class IndexControlRWIs_p {
} catch (final SpaceExceededException e ) {
ConcurrentLog.logException(e);
}
url = segment.fulltext().getURL(b);
url = segment.fulltext().getURL(ASCII.String(b));
segment.fulltext().remove(b);
if ( url != null ) {
for ( final String supportedBlacklistType : supportedBlacklistTypes ) {
@ -401,7 +401,7 @@ public class IndexControlRWIs_p {
} catch (final SpaceExceededException e ) {
ConcurrentLog.logException(e);
}
url = segment.fulltext().getURL(b);
url = segment.fulltext().getURL(ASCII.String(b));
segment.fulltext().remove(b);
if ( url != null ) {
for ( final BlacklistType supportedBlacklistType : BlacklistType.values() ) {

@ -183,7 +183,7 @@ public class IndexControlURLs_p {
}
if (post.containsKey("urlhashdelete")) {
final DigestURL url = segment.fulltext().getURL(ASCII.getBytes(urlhash));
final DigestURL url = segment.fulltext().getURL(urlhash);
if (url == null) {
prop.putHTML("result", "No Entry for URL hash " + urlhash + "; nothing deleted.");
} else {

@ -34,7 +34,7 @@ public class add_ymark {
if (post.containsKey("urlHash")) {
final String urlHash = post.get("urlHash",YMarkUtil.EMPTY_STRING);
final DigestURL url = sb.index.fulltext().getURL(urlHash.getBytes());
final DigestURL url = sb.index.fulltext().getURL(urlHash);
final String folders = post.get(YMarkEntry.BOOKMARK.FOLDERS.key(),YMarkEntry.BOOKMARK.FOLDERS.deflt());
final String tags = post.get(YMarkEntry.BOOKMARK.TAGS.key(),YMarkUtil.EMPTY_STRING);
try {

@ -595,7 +595,7 @@ public class yacysearch {
return prop;
}
final String bookmarkHash = post.get("bookmarkref", ""); // urlhash
final DigestURL url = indexSegment.fulltext().getURL(UTF8.getBytes(bookmarkHash));
final DigestURL url = indexSegment.fulltext().getURL(bookmarkHash);
if ( url != null ) {
try {
sb.tables.bookmarks.createBookmark(

@ -77,7 +77,7 @@ This is a list of searches that had been requested from remote peer search inter
#File: Blacklist_p.html
#---------------------------
Blacklist Administration==Blacklist Verwaltung
Used Blacklist engine:==Benutzte Blacklist Verwaltung:
#Used Blacklist engine:==Benutzte Blacklist Verwaltung:
This function provides an URL filter to the proxy; any blacklisted URL is blocked==Diese Funktion stellt einen URL-Filter vor den Proxy. Das Laden einer URL aus der Blacklist wird geblockt.
from being loaded. You can define several blacklists and activate them separately.==Sie können mehrere Blacklists anlegen und getrennt aktivieren.
You may also provide your blacklist to other peers by sharing them; in return you may==Sie können ebenfalls Ihre Blacklist einem anderen Peer zum Download anbieten.
@ -86,9 +86,9 @@ Active list:==Aktive Liste(n):
No blacklist selected==Keine Blacklist gewählt
Select list to edit:==Liste auswählen:
not shared::shared==nicht freigegeben::freigegeben
"select"=="wählen"
"select"=="Wählen"
Create new list:==Neue Liste anlegen:
"create"=="anlegen"
"create"=="Anlegen"
Settings for this list==Einstellungen dieser Liste
"Save"=="Speichern"
Share/don't share this list==Liste freigeben/nicht freigeben
@ -102,20 +102,22 @@ Move selected pattern\(s\) to==Verschiebe gewählten Eintrag zu
#You can select them here for deletion==Sie können sie einzeln zum Löschen wählen
Add new pattern:==Neuen Eintrag hinzufügen:
"Add URL pattern"=="URL hinzufügen"
The right \'\*\', after the \'\/\', can be replaced by a <a href=\"http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/Pattern.html\">regex</a>.==Der rechte Asterisk '*', nach dem '/', kann durch einen <a href="http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/Pattern.html">regulären Ausdruck</a> ersetzt werden.
The right \'\*\', after the \'\/\', can be replaced by a==Der rechte Asterisk '*', nach dem '/', kann ersetzt werden durch einen
>regular expression<==>regulären Ausdruck<
domain.net\/fullpath<==domain.de/vollerpfad<
>domain.net\/\*<==>domain.de/*<
\*.domain.net\/\*<==*.domain.de/*<
\*.sub.domain.net\/\*<==*.sub.domain.de/*<
#sub.domain.\*\/\*<==sub.domain.*/*<
#domain.\*\/\*<==domain.*/*<
a complete <a href=\"http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/Pattern.html\">regex</a> \(slow\)==ein ganzer <a href="http://java.sun.com/j2se/1.5.0/docs/api/java/util/regex/Pattern.html">Regulärer Ausdruck</a> (langsam)
a complete <==ein ganzer <
\(slow\)==(langsam)
#was removed from blacklist==wurde aus Blacklist entfernt
#was added to the blacklist==wurde zur Blacklist hinzugefügt
Activate this list for==Diese Liste ist gültig für
Show entries:==Zeige Einträge:
Entries per page:==Einträge pro Seite:
"Go"=="Los"
"set"=="Setzen"
Edit existing pattern\(s\):==Bearbeite existierende Einträge:
"Save URL pattern\(s\)"=="URL Einträge speichern"
#-----------------------------
@ -245,7 +247,7 @@ YaCy \'\#\[clientname\]\#\': Bookmarks==YaCy '#[clientname]#': Lesezeichen
The bookmarks list can also be retrieved as RSS feed. This can also be done when you select a specific tag.==Die Liste der Lesezeichen kann auch als RSS Feed abgerufen werden. Dies ist auch beim Auswählen eines bestimmten Tags möglich.
Click the API icon to load the RSS from the current selection.==Klicken Sie auf die API Sprechblase, um einen RSS Feed der aktuellen Auswahl zu laden.
To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API">API wiki page</a>.==Um eine Liste aller APIs zu sehen, besuchen Sie die <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API">API Seite im Wiki</a>.
To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API" target="_blank">API wiki page</a>.==Um eine Liste aller APIs zu sehen, besuchen Sie die <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API" target="_blank">API Seite im Wiki</a>.
<h3>Bookmarks==<h3>Lesezeichen
Bookmarks \(==Lesezeichen \(
#Login==Login
@ -302,7 +304,6 @@ Public Queue==Öffentlicher Puffer
Websearch Comparison==Vergleichs-Suche
Left Search Engine==linke Suchmaschine
Right Search Engine==rechte Suchmaschine
Query==Suchwort
"Compare"=="Vergleiche"
Search Result==Suchergebnis
#-----------------------------
@ -315,7 +316,7 @@ User created:==Benutzer erstellt:
User changed:==Benutzer geändert:
Generic error.==Genereller Fehler.
Passwords do not match.==Passwörter stimmen nicht überein.
Username too short. Username must be \>\= 4 Characters.==Benutzername zu kurz. Benutzername muss länger als vier Zeichen sein.
Username too short. Username must be &gt;= 4 Characters.==Benutzername zu kurz. Benutzername muss mindestens vier Zeichen lang sein.
No password is set for the administration account.==Für den Administrator Zugang ist kein Passwort gesetzt.
Please define a password for the admin account.==Bitte setzen Sie ein Passwort für das admin Konto.
Admin Account==Admin Konto
@ -436,7 +437,7 @@ You can also use your peer without opening it, but this is not recomended.==Sie
#File: ConfigHeuristics_p.html
#---------------------------
Heuristics Configuration==Heuristik Konfiguration
A <a href=\"http://en.wikipedia.org/wiki/Heuristic\">heuristic</a> is an \'experience-based technique that help in problem solving, learning and discovery\' \(wikipedia\).==<a href="http://de.wikipedia.org/wiki/Heuristik">Heuristik</a> 'bezeichnet die Kunst, mit begrenztem Wissen und wenig Zeit zu guten Lösungen zu kommen.' (Wikipedia).
A <a href=\"http://en.wikipedia.org/wiki/Heuristic\" target="_blank">heuristic</a> is an \'experience-based technique that help in problem solving, learning and discovery\' \(wikipedia\).==<a href="http://de.wikipedia.org/wiki/Heuristik" target="_blank">Heuristik</a> 'bezeichnet die Kunst, mit begrenztem Wissen und wenig Zeit zu guten Lösungen zu kommen.' (Wikipedia).
The search heuristics that can be switched on here are techniques that help the discovery of possible search results based on link guessing, in-search crawling and requests to other search engines.==
Die Heuristiken zur Suche, die hier angeschaltet werden können, sind Techniken, die helfen, mögliche Suchergebnisse mit Hilfe von erratenen Links, Crawls während der Suche und Anfragen an andere Suchmaschinen zu entdecken.
When a search heuristic is used, the resulting links are not used directly as search result but the loaded pages are indexed and stored like other content.==Wenn eine Such Heuristik verwendet wird, werden die gefunden Links nicht direkt als Suchergebnisse angezeigt aber dafür die geladenen Seiten indexiert und mit dem anderen Inhalt abgespeichert.
@ -458,9 +459,32 @@ This means: right after the search request every page is loaded and every page t
If you check \'add as global crawl job\' the pages to be crawled are added to the global crawl queue \(remote peers can pickup pages to be crawled\).==Wenn 'als globaler Crawl hinzufügen' gewählt ist werden die zu indexierenden Seiten dem globalen Crawler hinzugefügt (entfernte Peers können beim Crawlen unterstützen).
Default is to add the links to the local crawl queue \(your peer crawls the linked pages\).==Vorgabe ist die Links der lokalen Crawl Queue hinzuzufügen.
add as global crawl job==als globaler Crawl hinzufügen
blekko: load external search result list from==blekko: lade externe Suchergebnisse von
When using this heuristic, then every search request line is used for a call to blekko.==Wenn diese Heuristik aktiv ist werden alle lokalen Suchanfragen an blekko weitergeleitet.
20 results are taken from blekko and loaded simultanously, parsed and indexed immediately.==Die ersten 20 Ergebnisse von blekko werden geladen und sofort indexiert.
opensearch load external search result list from active systems below==opensearch lade externe Suchergebnisse von aktiven Systemen, die unten gelistet sind
When using this heuristic, then every search request line is used for a call to listed opensearch systems until enough results to fill the current search page are available.==Wenn diese Heuristik genutzt wird, dann wird jede Suchanfragezeile für einen Aufruf der aufgelisteten OpenSearch Systeme verwendet, bis genügend Resultate verfügbar sind, um die aktuelle Suchseite zu füllen.
20 results are taken from remote system and loaded simultanously, parsed and indexed immediately.==20 Resultate werden vom remote System genommen und simultan geladen, geparsed und sofort indexiert.
To find out more about OpenSearch see==Um mehr über OpenSearch zu erfahren besuche
#>OpenSearch.org<==>OpenSearch.org<
Available/Active Opensearch System==Verfügbare/Aktive OpenSearch Systeme
>Active<==>Aktiv<
>Title<==>Titel<
>Comment<==>Kommentar<
Url <small>\(format opensearch==URL <small>(format OpenSearch
Url template syntax==URL Template Syntax
>delete<==>Lösche<
>new<==>neu<
"add"=="Hinzufügen"
"Save"=="Speichern"
"reset to default list"=="Reset zur Standardliste"
"discover from index" class=="Discover vom Index" class
With the button "discover from index" you can search within the metadata of your local index \(Web Structure Index\) to find systems which support the Opensearch specification.==Mit dem Knopf "Discover vom Index" können Sie in den Metadaten Ihres lokalen Suchindexes (Web Struktur Index) suchen, um Systeme zu finden, die die OpenSearch Spezifikation unterstützen.
The task is started in the background. It may take some minutes before new entries appear \(after refreshing the page\).==Der Task wird im Hintergrund gestartet. Es kann einige Minuten dauern bevor neue Einträge erscheinen (nachdem die Seite erneut geladen wurde).
Alternatively you may==Alternativ können Sie
>copy &amp; paste a example config file<==>eine existierende Beispiel Konfiguration mit Copy &amp; Paste kopieren<
located in <i>defaults/heuristicopensearch.conf</i> to the DATA/SETTINGS directory.==von <i>defaults/heuristicopensearch.conf</i> ins Verzeichnis DATA/SETTINGS.
For the discover function the <i>web graph</i> option of the web structure index and the fields <i>target_rel_s, target_protocol_s, target_urlstub_s</i> have to be switched on in the <a href="IndexSchema_p.html\?core=webgraph">webgraph Solr schema</a>.==Für die Discover Funktion müssen die <i>Web Graph</i> Option des Web Struktur Index und die Felder <i>target_rel_s, target_protocol_s, target_urlstub_s</i> im <a href="IndexSchema_p.html?core=webgraph">Web Graph Solr Schema</a> angeschaltet werden.
"switch Solr fields on"=="Schalte Solr Felder an"
\(\'modify Solr Schema\'\)==('Modifiziere Solr Schema')
#-----------------------------
#File: ConfigHTCache_p.html
@ -471,12 +495,14 @@ The cache is a rotating cache: if it is full, then the oldest entries are delete
HTCache Configuration==HTCache Konfiguration
The path where the cache is stored==Der Pfad an dem der Cache gespeichert wird
The current size of the cache==Die aktuelle Größe des Caches
\#\[actualCacheSize\]\# MB for \#\[actualCacheDocCount\]\# files, \#\[docSizeAverage\]\# KB / file in average==#[actualCacheSize]# MB für #[actualCacheDocCount]# Dateien, #[docSizeAverage]# KB / Datei im Durchschnitt
The maximum size of the cache==Die maximale Größe des Caches
"Set"=="Setzen"
Cleanup==Aufräumen
Cache Deletion==Cache Löschen
Delete HTTP &amp; FTP Cache==Lösche HTTP &amp; FTP Cache
Delete robots.txt Cache==Lösche robots.txt Cache
Delete cached snippet-fetching failures during search==Lösche gecachte Snippet-Hol-Fehler während der Suche
"Delete"=="Löschen"
#-----------------------------
@ -503,18 +529,40 @@ might overwrite existing data if a file of the same name exists already.==Achtun
#File: ConfigLiveSearch.html
#---------------------------
Integration of a Search Field for Live Search==Integration eines Suchfeldes für die Live Suche
Integration of Live Search with YaCy Search Widget==Integration der Livesuche mit dem YaCy Such-Widget
There are basically two methods for integrating the YaCy Search Widget with your web site.==Es gibt zwei Methoden, um das YaCy Such-Widget mit Ihrer Webseite zu integrieren.
Static hosting of widget on own HTTP server==Statisches Hosten des Widgets auf Ihrem eigenen HTTP Server
Remote access through selected YaCy Peer==Remote Access durch ausgewähltes YaCy Peer
Advantages:==Vorteile:
faster connection speed==Schnellere Verbindungsgeschwindigkeit
possibility for local adaptions==Möglichkeit von lokalen Anpassungen
Disadvantages:==Nachteile:
No automatic update to future releases of YaCy Search Widget==Kein automatisches Update auf zukünftige YaCy Suchwidgets
Ajax/JSONP cross domain requests needed to query remote YaCy Peer==AJAX/JSONP Cross Domain Anfragen sind nötig, um den remote YaCy Peer abzufragen
Installing:==Installieren:
download yacy-portalsearch.tar.gz from==Downloaden von yacy-portalsearch.tar.gz von
unpack within your HTTP servers path==Entpacken in das HTTP Server Verzeichnis
use ./yacy/portalsearch/yacy-portalsearch.html as reference for integration with your own portal page==Verwenden von ./yacy/portalsearch/yacy-portalsearch.html als Referenz für eine Integration mit Ihrem bestehenden Portal
#Remote access through selected YaCy Peer==Remote Access durch ausgewähltes YaCy Peer
#Advantages:==Vorteile:
Always latest version of YaCy Search Widget==Immer die aktuellste Version des YaCy Suchwidgets
No Ajax/JSONP cross domain requests, as Search Widget and YaCy Peer are hosted on the same domain.==Keine AJAX/JSONP Cross Domain Anfragen, da Suchwidget und YaCy Peer auf derselben Domäne gehostet werden.
Under certain cirumstances slower than static hosting==Unter bestimmten Umständen langsamer als statisches Hosting
Just use the code snippet below and paste it any place in your own portal page==Verwenden Sie einfach das Codesnippet unten und kopieren Sie es in Ihr eigenes Suchportal
Please check if '\#\[ip\]\#:\#\[port\]\#' is appropriate or replace it with address of the YaCy Peer holding your index==Bitte prüfen Sie, ob '#[ip]#:#[port]#' korrekt ist, oder ersetzen Sie es durch die Adresse des YaCy Peers, der Ihren Index hält.
A \'Live-Search\' input field that reacts as search-as-you-type in a pop-up window can easily be integrated in any web page==Ein 'Live Suche' Eingabefeld zeigt live beim Eingeben in einem Pop-up Fenster Ergebnisse an und kann einfach in jede bestehende Webseite eingebaut werden
This is the same function as can be seen on all pages of the YaCy online-interface \(look at the window in the upper right corner\)==Das ist dieselbe Funktion, die man auf allen Seiten des YaCy Webinterfaces sehen kann (z.B. das Fenster in der oberen rechten Ecke)
Just use the code snippet below to integrate that in your own web pages==Verwenden Sie einfach den Code Ausschnitt unten, um das Suchfeld in Ihre Webseite einzubauen.
Please check if the address, as given in the example \'\#\[ip\]\#\:\#\[port\]\#\' here is correct and replace it with more appropriate values if necessary==Bitte überprüfen Sie, ob die Adresse die im Beispiel '#[ip]#:#[port]#' richtig ist und ersetzen Sie die Adresse wenn nötig mit richtigen Werten
Code Snippet:==Code Ausschnitt:
YaCy Portal Search==YaCy Portal Suche
#Just use the code snippet below to integrate that in your own web pages==Verwenden Sie einfach den Code Ausschnitt unten, um das Suchfeld in Ihre Webseite einzubauen.
Just use the code snippet below and paste it any place in your own portal page==Verwenden Sie einfach den Code Ausschnitt unten, um das Suchfeld auf Ihrem eigenen Webportal zu verwenden.
#Please check if the address, as given in the example \'\#\[ip\]\#\:\#\[port\]\#\' here is correct and replace it with more appropriate values if necessary==Bitte überprüfen Sie, ob die Adresse die im Beispiel '#[ip]#:#[port]#' richtig ist und ersetzen Sie die Adresse wenn nötig mit richtigen Werten
#Code Snippet:==Code Ausschnitt:
#YaCy Portal Search==YaCy Portal Suche
"Search"=="Suche"
Configuration options and defaults for \'yconf\':==Konfigurations Optionen und Standardeinstellungen für 'yconf':
Defaults<==Standardeinstellungen<
url<==URL<
is a mandatory property - no default<==muss angegeben werden<
YaCy P2P Web Search==YaCy P2P Web Suche
#is a mandatory property - no default<==muss angegeben werden<
#YaCy P2P Web Search==YaCy P2P Web Suche
Size and position \(width \| height \| position\)==Größe und Position (Breite | Höhe | Position)
Specifies where the dialog should be displayed. Possible values for position: \'center\', \'left\', \'right\', \'top\', \'bottom\', or an array containing a coordinate pair \(in pixel offset from top left of viewport\) or the possible string values \(e.g. \[\'right\',\'top\'\] for top right corner\)==Gibt an wo der Dialog angezeigt werden soll. Mögliche Werte für position: 'center', 'left', 'right', 'top', 'bottom', oder ein Array das ein Koordinatenpaar enthält (in Pixel Werten als Offset von der linken oberen Ecke des Viewports) oder einer möglichen String Variable (e.g. ['right','top'] für die rechte obere Ecke)
Animation effects \(show | hide\)==Animationseffekte (show | hide)
@ -525,14 +573,12 @@ If modal is set to true, the dialog will have modal behavior; other items on the
Modal dialogs create an overlay below the dialog but above other page elements.==Modale Dialoge erzeugen einen Overlay unter dem Dialog aber oberhalb anderer Seitenelemente
If resizable is set to true, the dialog will be resizeable.==Wenn resizable auf true gesetzt wird, kann man die Größe des Dialogfensters verändern.
Load JavaScript load_js==JavaScript laden load_js
If load_js is set to false, you have to manually load the needed JavaScript on your portal page.==Wenn load_js auf false gesetzt wird, müssen Sie manuell das benötigte Javascript auf Ihrer Portalseite laden.
This can help to avoid timing problems or double loading.==Das kann helfen Timing Probleme oder das doppelte Laden zu verhindern.
Load Stylesheets load_css==Stylesheets laden load_css
If load_css is set to false, you have to manually load the needed CSS on your portal page.==Wenn load_css auf false gesetzt wird, müssen Sie manuell das benötigte CSS auf Ihrer Portalseite laden.
#Themes==Themes
You can <==Sie können <
download</a> ready made themes or <a href=\"http://jqueryui.com/themeroller/\" target=\"_blank\">create</a>==fertige Themes herunterladen</a> oder <a href="http://jqueryui.com/themeroller/" target="_blank">erstellen</a>
your own custom theme. <br/>Themes are installed into: DATA/HTDOCS/yacy/ui/css/themes/==Ihr ganz eigenes Theme. <br/>Themes werden im folgenden Verzeichnis installiert: DATA/HTDOCS/yacy/ui/css/themes/
This parameter is used for static hosting only.==Dieser Parameter wird nur für das statische Hosting verwendet.
>Themes<==>Themen<
You can download standard jquery-ui themes or create your own custom themes on==Sie können Standard jQuery UI Themen herunterladen oder generieren Sie Ihr eigenes Thema auf
Themes are installed in ./yacy/jquery/themes/ \(static hosting\) or in DATA/HTDOCS/jquery/themes/ on remote YaCy Peer.==Themen sind installiert in ./yacy/jquery/themes/ (statisches Hosting) oder in DATA/HTDOCS/jquery/themes/ beim remote YaCy Peer.
YaCy ships with 'start' and 'smoothness' themes pre-installed.==YaCy kommt vorinstalliert mit 'start' und 'smoothness' Themen.
#-----------------------------
#File: ConfigNetwork_p.html
@ -613,32 +659,12 @@ Content Parser Settings==Inhalt Parser Einstellungen
With this settings you can activate or deactivate parsing of additional content-types based on their MIME-types.==Mit diesen Einstellungen können Sie das Parsen zusätzlicher Dateitypen basierend auf ihren MIME-Typen ermöglichen.
For a detailed description of the various MIME-types take a look at==Für eine detaillierte Beschreibung der verschiedenen MIME-Typen können Sie einen Blick auf
http://www.iana.org/assignments/media-types/</a>==http://www.iana.org/assignments/media-types/</a> werfen.
enable/disable Parser==Parser aktiv / inaktiv
# --- Parser Names are hard-coded BEGIN ---
##Mime-Type==MIME Typ
##Microsoft Powerpoint Parser==Microsoft Powerpoint Parser
#Torrent Metadata Parser==Torrent Metadaten Parser
##HTML Parser==HTML Parser
#GNU Zip Compressed Archive Parser==GNU Zip Komprimiertes Archiv Parser
##Adobe Flash Parser==Adobe Flash Parser
#Word Document Parser==Word Dokument Parser
##vCard Parser==vCard Parser
#Bzip 2 UNIX Compressed File Parser==bzip2 UNIX Komprimierte Datei Parser
#OASIS OpenDocument V2 Text Document Parser==OASIS OpenDocument V2 Text Dokument Parser
##Microsoft Excel Parser==Microsoft Excel Parser
#ZIP File Parser==ZIP Datei Parser
##Rich Site Summary/Atom Feed Parser==Rich Site Summary / Atom Feed Parser
#Comma Separated Value Parser==Comma Separated Value (CSV) Parser
##Microsoft Visio Parser==Microsoft Visio Parser
#Tape Archive File Parser==Bandlaufwerk Archiv Datei Parser
#7zip Archive Parser==7zip Archiv Parser
##Acrobat Portable Document Parser==Adobe Acrobat Portables Dokument Format Parser
##Rich Text Format Parser==Rich Text Format Parser
#Generic Image Parser==Generischer Bild Parser
#PostScript Document Parser==PostScript Dokument Parser
#Open Office XML Document Parser==Open Office XML Dokument Parser
#BMP Image Parser==BMP Bild Parser
# --- Parser Names are hard-coded END ---
If you want to test a specific parser you can do so using the==
Wenn Sie einen bestimmten Parser testen wollen, verwenden Sie dafür den
>File Viewer<==>Datei Betrachter<
> enable/disable<==> aktiv / inaktiv<
>Extension<==>Erweiterung<
>Mime-Type<==>MIME-Typ<
"Submit"=="Speichern"
#-----------------------------
@ -655,23 +681,33 @@ Greeting Line<==Grußzeile<
URL of Home Page<==URL der Homepage<
URL of a Small Corporate Image<==URL des kleinen Corporate Identity Bildes<
URL of a Large Corporate Image<==URL des großen Corporate Identity Bildes<
Enable Search for Everyone\?==Suche für Jedermann aktivieren?
Search is available for everyone==Suche steht für Jedermann zur Verfügung
Only the administator is allowed to search==Nur der Administrator darf suchen
Show additional interaction features in footer==Zeige zusätzliche Features zur Interaktion in der Fußzeile
User-Logon==Benutzer-Logon
Snippet Fetch Strategy &amp; Link Verification==Snippet Fetch Strategie &amp; Link Verifikation
Speed up search results with this option! \(use CACHEONLY or FALSE to switch off verification\)==Beschleunige die Suchergebnisse mit dieser Option! (Verwende CACHEONLY oder FALSE, um die Verifikation abzuschalten)
NOCACHE: no use of web cache, load all snippets online==NOCACHE: Keine Verwendung des Webcache, alle Snippets online laden
IFFRESH: use the cache if the cache exists and is fresh otherwise load online==IFFRESH: Verwende den Cache wenn er existiert und frisch ist und lade ansonsten online nach
IFEXIST: use the cache if the cache exist or load online==IFEXIST: Verwende den Cache wenn er existiert ansonsten lade online
If verification fails, delete index reference==Wenn die Überprüfung fehlschlägt lösche die Index Referenz
CACHEONLY: never go online, use all content from cache. If no cache entry exist, consider content nevertheless as available and show result without snippet==CACHEONLY: Gehe niemals online und verwende den ganzen Content aus dem Cache. Wenn kein Eintrag im Cache existiert betrachte den Content trotzdem als existent und zeige Resultate ohne Snippets.
FALSE: no link verification and not snippet generation: all search results are valid without verification==FALSE: Keine Linkprüfung und keine Snippet Generierung: Alle Suchergebnisse sind ohne Prüfung valide
Greedy Learning Mode==Gieriger Lernmodus
load documents linked in search results, will be deactivated automatically when index size==Lade Dokumente die in Suchergebnissen verlinkt sind. Wird automatisch deaktiviert wenn die Indexgröße
Show Navigation Bar on Search Page?==Navigationsleiste auf Suchseite anzeigen?
Show Navigation Top-Menu&nbsp;==Zeige Top-Menu Navigation&nbsp;
no link to YaCy Menu \(admin must navigate to /Status.html manually\)==kein Verweis auf YaCy Menu (Der Admin muss manuell auf /Status.html wechseln)
Show Advanced Search Options on Search Page?==Erweiterte Suchoptionen auf Suchseite anzeigen?
Show Advanced Search Options on index.html&nbsp;==Erweiterte Suchoptionen auf index.html anzeigen?
do not show Advanced Search==Erweiterte Suche nicht anzeigen
Show Information Links for each Search Result Entry==Informationslinks für jeden Suchergebnis Eintrag anzeigen
>Date&==>Datum&
>Size&==>Größe&
>Metadata&==>Metadaten&
>Parser&==>Parser&
>Pictures==>Bilder
Default Pop-Up Page<==Standard Pop-up<
>Status Page==>Status Seite
>Search Front Page==>Frontseite Suche
>Search Page \(small header\)==>Suchseite (kleine Kopfzeile)
>Interactive Search Page==>Interaktive Suchseite
Default maximum number of results per page==Standard Maximum der Suchergebnisse pro Seite
Default index.html Page \(by forwarder\)==Standard index.html Seite (durch Weiterleitung)
Target for Click on Search Results==Zielfenster beim Klicken auf ein Suchergebnis
@ -679,10 +715,20 @@ Target for Click on Search Results==Zielfenster beim Klicken auf ein Suchergebni
\"_self\" \(same window\)=="_self" (Im gleichen Fenster)
\"_parent\" \(the parent frame of a frameset\)=="_parent" (Der Parent Frame eines Framesets)
\"_top\" \(top of all frames\)=="_top" (Oberster Frame)
\"searchresult\" \(a default custom page name for search results\)=="Suchergebnis" (Eine standardmäßig konfigurierbare Setie für Suchergebnisse)
\"searchresult\" \(a default custom page name for search results\)=="Suchergebnis" (Eine standardmäßig konfigurierbare Seite für Suchergebnisse)
Special Target as Exception for an URL-Pattern==Spezial Ziel als Ausnahme für ein URL-Muster
Pattern:<==Muster:<
>Exclude Hosts<==>Hosts ausnehmen<
List of hosts that shall be excluded from search results by default but can be included using the site:&lt;host&gt; operator:==Liste aller Hosts, die standardmäßig von den Suchergebnissen ausgeschlossen werden sollen, aber mit dem site:&lt;host&gt; Operator wieder aufgenommen werden können:
\'About\' Column<br/>\(shown in a column alongside<br/>with the search result page\)=='Über' Spalte<br/>(wird in einer Spalte<br/>mit der Suchergebnisseite angezeigt)
\(Headline\)==(Kopfzeile)
\(Content\)==(Inhalt)
"Change Search Page"=="Ändere die Suchseite"
"Set to Default Values"=="Standardwerte setzen"
You have ==Sie müssen
set a remote user/password==einen Remote Benutzer mit Passwort anlegen
to change this options.==, um diese Option zu ändern.
The search page can be integrated in your own web pages with an iframe. Simply use the following code:==Die Suchseite kann mit einem iframe in Ihre eigenen Webseiten eingebaut werden. Verwenden Sie dazu den folgenden Code:
This would look like:==Das würde so aussehen:
For a search page with a small header, use this code:==Für eine Suchseite mit kleiner Kopfzeile verwenden Sie folgenden Code:
@ -716,6 +762,7 @@ Here are all configuration options from YaCy.==Hier sind alle Konfigurationseins
You can change anything, but some options need a restart, and some options can crash YaCy, if wrong values are used.==Sie können alles ändern, bedenken Sie aber, dass einige Änderungen einen Neustart von YaCy erfordern und andere Änderungen YaCy ganz zum Absturz bringen können.
For explanation please look into defaults/yacy.init==Eine Erklärung finden Sie in der Datei defaults/yacy.init
"Save"=="Speichern"
"Clear"=="Leeren"
#-----------------------------
#File: ConfigRobotsTxt_p.html
@ -758,6 +805,9 @@ Replace the word \"MySearch\" with your own message==Ersetzen Sie das Wort "Mein
Manual System Update==Manuelle Systemaktualisierung
Current installed Release==Aktuell installierte Version
Available Releases==Verfügbare Versionen
>changelog<==>Changelog<
> and <==> und <
> RSS feed<==> RSS Feed<
\(unsigned\)==(unsigniert)
\(signed\)==(signiert)
"Download Release"=="Version Herunterladen"
@ -1259,6 +1309,8 @@ Confirm Deletion==Löschen Bestätigen
Count Colors:==Legende der farbigen Zahlen:
Documents without Errors==Dokumente ohne Fehler
Pending in Crawler==Warten im Crawler
Crawler Excludes<==Crawler Ausnahmen<
Load Errors<==Ladefehler<
#Load Errors \(exclusion/failure\)==Ladefehler (Ausschluss/Fehler)
#Browser for \#\[path\]\#==Browser für #[path]#
documents stored for host: \#\[hostsize\]\#==Gespeicherte Dokumente für Host: #[hostsize]#
@ -1282,6 +1334,15 @@ Inbound Links, incoming to \#\[host\]\# - Host List==Eingehende Links, eingehend
#Administration Options==Administration Optionen
#-----------------------------
#File: HostBrowserAdmin_p.html
#---------------------------
Administration Options==Administrator Optionen
Delete all==Lösche alle
>Load Errors<==>Ladefehler<
from index==vom Index
"Delete Load Errors"=="Lösche alle Ladefehler"
#-----------------------------
#File: index.html
#---------------------------
YaCy \'\#\[clientname\]\#\': Search Page==YaCy '#[clientname]#': Suchseite
@ -2890,6 +2951,17 @@ You can now go back to the <a href="Settings_p.html">Settings</a> page if you wa
See you soon!==Bis bald!
Just a moment, please!==Einen Moment bitte!
Application will terminate after working off all scheduled tasks.==YaCy Proxy wird beendet, nachdem alle ausstehenden Aufgaben abgearbeitet wurden.
Please send us feed-back!==Bitte senden Sie uns Feedback!
We don't track YaCy users, YaCy does not send \'home-pings\', we do not even know how many people use YaCy as their private search engine.==Wir tracken YaCy Benutzer nicht. YaCy sendet keine 'Pings nach Hause'. Wir wissen noch nicht einmal wie viele Menschen YaCy als ihre private Suchmaschine verwenden.
Therefore we like to ask you: do you like YaCy\? Will you use it again... if not, why\? Is is possible that we change a bit to suit your needs\?==Darum wollen wir Sie fragen: Mögen Sie YaCy? Werden Sie es wieder verwenden ... wenn nicht ... Warum? Ist es möglich, dass wir uns verändern, um uns Ihren Anforderungen anzupassen?
Please send us feed-back about your experience with an==Bitte senden Sie uns Ihr Feedback mit einer
>anonymous message<==>anonymen Nachricht<
or a<==oder einem<
posting to our==Posting auf unserem
web forums==Web Forum
>bug report<==>Bug Report<
>Professional Support<==>Professioneller Support<
If you are a professional user and you would like to use YaCy in your company in combination with consulting services by YaCy specialists, please see==Wenn Sie ein professioneller Anwender sind und YaCy in Ihrer Firma in Kombination mit Beratungsservices eines YaCy Spezialisten einsetzen wollen, besuchen Sie
Then YaCy will restart.==Dann wird YaCy neu gestartet.
If you can't reach YaCy's interface after 5 minutes restart failed.==Wenn Sie nach 5 Minuten nicht auf das YaCy Interface zugreifen können, dann ist der Neustart fehlgeschlagen.
Installing release==Installiere Release
@ -3187,10 +3259,7 @@ You can edit your profile <a href="ConfigProfile_p.html">here</a>==Sie können I
#File: Crawler_p.html
#---------------------------
Crawler Queues==Crawler Puffer
PPM \(Pages Per Minute\)==PPM (Seiten pro Minute)
#Traffic \(Crawler\)==Traffic (Crawler)
RWI RAM \(Word Cache\)==RWI RAM (Wörter Zwischenspeicher)
#Crawler==Crawler
Error with profile management. Please stop YaCy, delete the file DATA/PLASMADB/crawlProfiles0.db==Fehler im Profil Management. Bitte stoppen Sie YaCy, löschen Sie die Datei DATA/PLASMADB/crawlProfiles0.db
and restart.==und starten Sie YaCy neu.
Error:==Fehler:
@ -3207,37 +3276,31 @@ Error with file input==Fehler mit Datei Eingabe
started.==gestartet.
Please wait some seconds,==Bitte warten Sie einige Sekunden,
it may take some seconds until the first result appears there.==es kann einige Sekunden dauern, bis hier die ersten Ergebnisse zu sehen sind.
If you crawl any un-wanted pages, you can delete them <a href="IndexCreateWWWLocalQueue_p.html">here</a>.==Wenn Ihr Crawl Seiten indexiert, die Sie nicht indexiert haben wollen, können Sie diese <a href="IndexCreateWWWLocalQueue_p.html">hier</a> löschen.
Crawl Queue:==Crawler Puffer:
Queue</th>==Puffer</th>
Profile</th>==Profil</th>
Initiator==Auftraggeber
Depth</th>==Tiefe</th>
Modified Date==Änderungsdatum
Anchor Name==Linktitel
#URL==URL
Delete==Löschen
Next update in==Nächste Aktualisierung in
/> seconds.==/> Sekunden.
See a access timing <a href="api/latency_p.xml">here</a>==<a href="api/latency_p.xml">Hier</a> gibt es eine Tabelle mit Latenz- und Zugriffszeiten
Queue</th>==Puffer</th>
>Size==>Größe
>Progress<==>Fortschritt<
#Max==Max
"set"=="Setzen"
#Indexing</td>==Indexieren</td>
Loader==Lader
>Index Size<==>Index Größe<
Seg-<br/>ments==Seg-<br/>mente
>Documents<==>Dokumente<
>solr search api<==>Solr Such-API<
>Webgraph Edges<==>Webgraph Kanten<
Citations<br/>\(reverse link index\)==Citations<br/>(Rückwärts Such Index)
RWIs<br/>\(P2P Chunks\)==RWIs<br/>(P2P Anteile)
Local Crawler==Lokaler Crawler
unlimited==uneingeschränkt
#Remote Crawler==Remote Crawler
Speed==Geschwindigkeit
"minimum"=="Minimum"
"custom"=="benutzerdefiniert"
"maximum"=="Maximum"
Limit Crawler==Limitierter Crawler
Remote Crawler==Entfernter Crawler
No-Load Crawler==Keine-Ladung Crawler
Speed / PPM<br/>\(Pages Per Minute\)==Geschwindigkeit / PPM<br/>(PPM = Seiten pro Minute)
Database==Datenbank
Entries==Einträge
Pages \(URLs\)==Seiten (URLs)
RWIs \(Words\)==RWIs (Wörter)
Indicator==Indikator
Level==Stufe
Postprocessing Progress==Postprozess Fortschritt
Traffic \(Crawler\)==Daten-Traffic (Crawler)
Load<==Auslastung<
#-----------------------------
#File: WatchWebStructure_p.html
@ -3532,7 +3595,7 @@ Blacklist Administration==Blacklist Verwaltung
Blacklist Cleaner==Blacklist aufräumen
Blacklist Test==Blacklist testen
Import/Export==Import / Export
Index Cleaner==Index Aufräumer
Content Control==Inhaltskontrolle
#-----------------------------
#File: env/templates/submenuConfig.template
@ -3542,15 +3605,13 @@ Peer Administration Console==Peer Administrations Konsole
Basic Configuration==Eingangskonfiguration
>Accounts==>Konten
Network Configuration==Netzwerk Konfiguration
>Heuristics<==>Heuristik<
Dictionary Loader==Wörterbuch Download
System Update==System Aktualisierung
Download System Update==Download System Aktualisierung
>Performance==>Leistung
Advanced Settings==Erweiterte Einstellungen
Parser Configuration==Parser Einstellungen
Local robots.txt==Lokale robots.txt
#Web Cache==Web Cache
Advanced Properties==Erweiterte Konfiguration
#>Thread Dump<==>Thread Dump<
#-----------------------------
#File: env/templates/submenuContentIntegration.template
@ -3579,7 +3640,6 @@ Local Crawling</a>==Lokales Crawlen</a>
Global Crawling</a>==Globales Crawlen</a>
Surrogate Import</a>==Surrogat Import</a>
Processing Monitor==Prozess Monitor
Crawler Queues==Crawler Puffer
Loader<==Lade Puffer<
Rejected URLs==Abgelehnte URLs
>Queues<==>Warteschlangen<
@ -3602,9 +3662,16 @@ User Profile==Benutzerprofil
#File: env/templates/submenuIndexControl.template
#---------------------------
Index Administration==Indexverwaltung
Reverse Word Index Administration==Reverse Wort Indexverwaltung
URL References Database==URL Referenzen Datenbank
URL Viewer==URL Betrachter
URL Database Administration==URL Datenbank Administration
Index Deletion==Index Löschung
Index Sources &amp; Targets==Index Quellen &amp; Ziele
#Solr Schema Editor==Solr Schema Editor
Field Re-Indexing==Feld Re-Indexierung
Reverse Word Index==Reverse Wort Index
Index Cleaner==Index Reiniger
Content Analysis==Inhaltsanalyse
Web Cache==Webcache
Parser Configuration==Parser Konfiguration
#-----------------------------
#File: env/templates/submenuIndexCreate.template
@ -3663,6 +3730,26 @@ Publication==Veröffentlichung
File Hosting==Datei Freigabe
#-----------------------------
#File: env/templates/submenuSearchConfiguration.template
#---------------------------
Integrated Search Configuration==Integrierte Suchkonfiguration
Generic Search Portal==Generisches Suchportal
Search Page Layout==Suchseite Layout
Appearance==Darstellung
Language==Sprache
User Profile==Benutzerprofil
Heuristics==Heuristik
Solr Ranking Config==Solr Ranking Konfiguration
RWI Ranking Config==RWI Ranking Konfiguration
#-----------------------------
#File: env/templates/submenuSearchIntegration.template
#---------------------------
Search Integration into External Sites==Integration der Suche in Externe Seiten
Live Search Anywhere==Live Suche Überall
Search Box Anywhere==Suchbox Überall
#-----------------------------
#File: env/templates/submenuViewLog.template
#---------------------------
Server Log Menu==Server Log Menü

@ -121,10 +121,7 @@ change_admin_password()
echo 'Entries did not match, please try again.'
change_admin_password
else
BASE64=`$JAVA -classpath lib/yacycore.jar net.yacy.cora.order.Base64Order -es "$USERNAME:$INPUT1"`
B64MD5=`$JAVA -classpath lib/yacycore.jar net.yacy.cora.order.Digest -strfhex "$BASE64"`
B64MD5=`echo $B64MD5 | sed "s/\(\S\) .*/\1/"`
replace_parameter 'adminAccountBase64MD5' "$B64MD5"
replace_parameter 'adminAccount' "$USERNAME:$INPUT1"
fi
STATUS='Admin password has been changed.'
@ -272,8 +269,7 @@ read_parameter()
# REPLACES THE VALUE OF A PARAMETER (FIRST ARGUMENT) WITH A NEW ONE (SECOND ARGUMENT)
replace_parameter()
{
sed "s/^\($1 *=\)\(.*\)/\1$2/" "$CONFIGFILE" >"$SETTINGSDIR/yacy.tmp"
mv "$SETTINGSDIR/yacy.tmp" "$CONFIGFILE"
$JAVA -classpath lib/yacycore.jar net.yacy.yacy -config "$1=$2" 2>/dev/null
}
#

@ -31,6 +31,7 @@ import java.util.Map;
import java.util.Set;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrDocumentList;
import org.apache.solr.common.SolrInputDocument;
import org.apache.solr.common.SolrInputField;
@ -157,8 +158,9 @@ public class SchemaConfiguration extends Configuration implements Serializable {
continue uniquecheck;
}
try {
final SolrDocument doc = segment.fulltext().getDefaultConnector().getDocumentById(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"");
if (doc != null) {
final SolrDocumentList docs = segment.fulltext().getDefaultConnector().getDocumentListByQuery(CollectionSchema.host_id_s + ":\"" + hostid + "\" AND " + signaturefield.getSolrFieldName() + ":\"" + checkhash.toString() + "\"", 0, 1);
if (docs != null && !docs.isEmpty()) {
SolrDocument doc = docs.get(0);
// switch unique attribute in new document
sid.setField(uniquefield.getSolrFieldName(), false);
// switch attribute in existing document

@ -82,24 +82,32 @@ public abstract class AbstractSolrConnector implements SolrConnector {
}
protected final static int pagesize = 100;
protected static long getLoadDate(final Object doc) {
protected static Metadata getMetadata(final Object doc) {
if (doc == null) return null;
Object d = null;
if (doc != null) {
if (doc instanceof SolrInputDocument) d = ((SolrInputDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
if (doc instanceof SolrDocument) d = ((SolrDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
if (doc instanceof org.apache.lucene.document.Document) {
String ds = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.load_date_dt.getSolrFieldName());
try {
d = Long.parseLong(ds);
} catch (NumberFormatException e) {
d = -1l;
}
String url = null;
if (doc instanceof SolrInputDocument) {
d = ((SolrInputDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
url = (String) ((SolrInputDocument) doc).getFieldValue(CollectionSchema.sku.getSolrFieldName());
}
if (doc instanceof SolrDocument) {
d = ((SolrDocument) doc).getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
url = (String) ((SolrDocument) doc).getFieldValue(CollectionSchema.sku.getSolrFieldName());
}
if (doc instanceof org.apache.lucene.document.Document) {
String ds = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.load_date_dt.getSolrFieldName());
try {
d = Long.parseLong(ds);
} catch (NumberFormatException e) {
d = -1l;
}
url = ((org.apache.lucene.document.Document) doc).get(CollectionSchema.sku.getSolrFieldName());
}
if (d == null) return -1l;
if (d instanceof Long) return ((Long) d).longValue();
if (d instanceof Date) return ((Date) d).getTime();
return -1l;
if (d == null) return null;
long date = -1;
if (d instanceof Long) date = ((Long) d).longValue();
if (d instanceof Date) date = ((Date) d).getTime();
return new Metadata(url, date);
}
/**
@ -239,11 +247,11 @@ public abstract class AbstractSolrConnector implements SolrConnector {
/**
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
* @return the load date if any entry in solr exists, -1 otherwise
* @return metadata if any entry in solr exists, null otherwise
* @throws IOException
*/
@Override
public long getLoadTime(String id) throws IOException {
public Metadata getMetadata(String id) throws IOException {
// construct raw query
final SolrQuery params = new SolrQuery();
//params.setQuery(CollectionSchema.id.getSolrFieldName() + ":\"" + id + "\"");
@ -253,15 +261,15 @@ public abstract class AbstractSolrConnector implements SolrConnector {
params.setStart(0);
params.setFacet(false);
params.clearSorts();
params.setFields(CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
params.setFields(CollectionSchema.id.getSolrFieldName(), CollectionSchema.sku.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
params.setIncludeScore(false);
// query the server
final SolrDocumentList sdl = getDocumentListByParams(params);
if (sdl == null || sdl.getNumFound() <= 0) return -1;
if (sdl == null || sdl.getNumFound() <= 0) return null;
SolrDocument doc = sdl.iterator().next();
long d = getLoadDate(doc);
return d;
Metadata md = getMetadata(doc);
return md;
}
/**

@ -31,7 +31,9 @@ import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.storage.ARC;
import net.yacy.cora.storage.ARH;
import net.yacy.cora.storage.ConcurrentARC;
import net.yacy.cora.storage.ConcurrentARH;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.schema.CollectionSchema;
@ -66,7 +68,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
try {
removeIdFromUpdateQueue(id);
ConcurrentUpdateSolrConnector.this.connector.deleteById(id);
ConcurrentUpdateSolrConnector.this.idCache.remove(id);
ConcurrentUpdateSolrConnector.this.metadataCache.remove(id);
} catch (final IOException e) {
ConcurrentLog.logException(e);
}
@ -89,8 +91,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
Collection<SolrInputDocument> docs = new ArrayList<SolrInputDocument>(getmore + 1);
docs.add(doc);
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
long date = AbstractSolrConnector.getLoadDate(doc);
updateIdCache(id, date);
Metadata md = AbstractSolrConnector.getMetadata(doc);
updateCache(id, md);
for (int i = 0; i < getmore; i++) {
SolrInputDocument d = ConcurrentUpdateSolrConnector.this.updateQueue.take();
if (d == POISON_DOCUMENT) {
@ -99,8 +101,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
docs.add(d);
id = (String) d.getFieldValue(CollectionSchema.id.getSolrFieldName());
date = AbstractSolrConnector.getLoadDate(d);
updateIdCache(id, date);
md = AbstractSolrConnector.getMetadata(d);
updateCache(id, md);
}
//ConcurrentLog.info("ConcurrentUpdateSolrConnector", "sending " + docs.size() + " documents to solr");
try {
@ -112,8 +114,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
// if there is only a single document, send this directly to solr
//ConcurrentLog.info("ConcurrentUpdateSolrConnector", "sending one document to solr");
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
long date = AbstractSolrConnector.getLoadDate(doc);
updateIdCache(id, date);
Metadata md = AbstractSolrConnector.getMetadata(doc);
updateCache(id, md);
try {
ConcurrentUpdateSolrConnector.this.connector.add(doc);
} catch (final OutOfMemoryError e) {
@ -134,15 +136,17 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
}
}
private ARC<String, Long> idCache;
private ARC<String, Metadata> metadataCache;
private ARH<String> missCache;
private BlockingQueue<SolrInputDocument> updateQueue;
private BlockingQueue<String> deleteQueue;
private Thread deletionHandler, updateHandler;
public ConcurrentUpdateSolrConnector(SolrConnector connector, int updateCapacity, int idCacheCapacity, int concurrency) {
this.connector = connector;
this.idCache = new ConcurrentARC<String, Long>(idCacheCapacity, concurrency); // url hash to load time
this.metadataCache = new ConcurrentARC<String, Metadata>(idCacheCapacity, concurrency);
this.missCache = new ConcurrentARH<String>(idCacheCapacity, concurrency);
this.updateQueue = new ArrayBlockingQueue<SolrInputDocument>(updateCapacity);
this.deleteQueue = new LinkedBlockingQueue<String>();
this.deletionHandler = null;
@ -159,7 +163,8 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override
public void clearCaches() {
this.connector.clearCaches();
this.idCache.clear();
this.metadataCache.clear();
this.missCache.clear();
}
/**
@ -192,16 +197,18 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
return null;
}
private long existIdFromUpdateQueue(String id) {
if (this.updateQueue.size() == 0) return -1;
private Metadata existIdFromUpdateQueue(String id) {
if (this.updateQueue.size() == 0) return null;
Iterator<SolrInputDocument> i = this.updateQueue.iterator();
while (i.hasNext()) {
SolrInputDocument doc = i.next();
if (doc == null) break;
String docID = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
if (docID != null && docID.equals(id)) return AbstractSolrConnector.getLoadDate(doc);
if (docID != null && docID.equals(id)) {
return AbstractSolrConnector.getMetadata(doc);
}
}
return -1;
return null;
}
private void removeIdFromUpdateQueue(String id) {
@ -231,10 +238,14 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
}
private void updateIdCache(String id, long time) {
private void updateCache(final String id, final Metadata md) {
if (id == null) return;
if (MemoryControl.shortStatus()) this.idCache.clear();
this.idCache.put(id, time);
if (MemoryControl.shortStatus()) {
this.metadataCache.clear();
this.missCache.clear();
}
this.metadataCache.put(id, md);
this.missCache.delete(id);
}
public void ensureAliveDeletionHandler() {
@ -305,9 +316,9 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
try {this.deletionHandler.join();} catch (final InterruptedException e) {}
try {this.updateHandler.join();} catch (final InterruptedException e) {}
this.connector.close();
this.idCache.clear();
this.metadataCache.clear();
this.connector = null;
this.idCache = null;
this.metadataCache = null;
}
@Override
@ -317,13 +328,14 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
try {this.updateQueue.put(POISON_DOCUMENT);} catch (final InterruptedException e) {}
try {this.updateHandler.join();} catch (final InterruptedException e) {}
this.connector.clear();
this.idCache.clear();
this.metadataCache.clear();
}
@Override
public void deleteById(String id) throws IOException {
public synchronized void deleteById(String id) throws IOException {
removeIdFromUpdateQueue(id);
this.idCache.remove(id);
this.metadataCache.remove(id);
this.missCache.add(id);
if (this.deletionHandler.isAlive()) {
try {this.deleteQueue.put(id);} catch (final InterruptedException e) {}
} else {
@ -332,10 +344,11 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
}
@Override
public void deleteByIds(Collection<String> ids) throws IOException {
public synchronized void deleteByIds(Collection<String> ids) throws IOException {
for (String id: ids) {
removeIdFromUpdateQueue(id);
this.idCache.remove(id);
this.metadataCache.remove(id);
this.missCache.add(id);
}
if (this.deletionHandler.isAlive()) {
for (String id: ids) try {this.deleteQueue.put(id);} catch (final InterruptedException e) {}
@ -346,40 +359,35 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override
public void deleteByQuery(final String querystring) throws IOException {
//new Thread() {
// public void run() {
ConcurrentUpdateSolrConnector.this.idCache.clear();
try {
ConcurrentUpdateSolrConnector.this.connector.deleteByQuery(querystring);
ConcurrentUpdateSolrConnector.this.idCache.clear();
} catch (final IOException e) {
ConcurrentLog.severe("ConcurrentUpdateSolrConnector", e.getMessage(), e);
}
ConcurrentUpdateSolrConnector.this.connector.commit(true);
// }
//}.start();
try {
ConcurrentUpdateSolrConnector.this.connector.deleteByQuery(querystring);
ConcurrentUpdateSolrConnector.this.metadataCache.clear();
ConcurrentUpdateSolrConnector.this.missCache.clear();
} catch (final IOException e) {
ConcurrentLog.severe("ConcurrentUpdateSolrConnector", e.getMessage(), e);
}
ConcurrentUpdateSolrConnector.this.connector.commit(true);
}
@Override
public long getLoadTime(String id) throws IOException {
Long date = this.idCache.get(id);
if (date != null) {cacheSuccessSign(); return date.longValue();}
if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); return -1;}
long d = existIdFromUpdateQueue(id);
if (d >= 0) {cacheSuccessSign(); return d;}
d = this.connector.getLoadTime(id);
if (d >= 0) {
updateIdCache(id, d);
return d;
}
return -1;
public Metadata getMetadata(String id) throws IOException {
// Look up the metadata for a document id, asking the underlying connector only as a last resort.
// Returns null when the document is known or found to be absent.
// cacheSuccessSign() is called on every answer that did not need the underlying connector.
// 1. ids recorded as missing are answered with null right away
if (this.missCache.contains(id)) {cacheSuccessSign(); return null;}
// 2. metadata cached by an earlier lookup or update
Metadata md = this.metadataCache.get(id);
if (md != null) {cacheSuccessSign(); return md;}
// 3. a deletion that is still queued means the document counts as absent
if (existIdFromDeleteQueue(id)) {cacheSuccessSign(); return null;}
// 4. a document that is still waiting in the update queue
md = existIdFromUpdateQueue(id);
if (md != null) {cacheSuccessSign(); return md;}
// 5. ask the underlying connector; remember a miss so the next lookup stops at step 1
md = this.connector.getMetadata(id);
if (md == null) {this.missCache.add(id); return null;}
// remember the hit for later lookups
updateCache(id, md);
return md;
}
@Override
public void add(SolrInputDocument solrdoc) throws IOException, SolrException {
String id = (String) solrdoc.getFieldValue(CollectionSchema.id.getSolrFieldName());
removeIdFromDeleteQueue(id);
updateIdCache(id, AbstractSolrConnector.getLoadDate(solrdoc));
updateCache(id, AbstractSolrConnector.getMetadata(solrdoc));
if (this.updateHandler.isAlive()) {
try {this.updateQueue.put(solrdoc);} catch (final InterruptedException e) {}
} else {
@ -392,7 +400,7 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
for (SolrInputDocument doc: solrdocs) {
String id = (String) doc.getFieldValue(CollectionSchema.id.getSolrFieldName());
removeIdFromDeleteQueue(id);
updateIdCache(id, AbstractSolrConnector.getLoadDate(doc));
updateCache(id, AbstractSolrConnector.getMetadata(doc));
}
if (this.updateHandler.isAlive()) {
for (SolrInputDocument doc: solrdocs) try {this.updateQueue.put(doc);} catch (final InterruptedException e) {}
@ -403,11 +411,16 @@ public class ConcurrentUpdateSolrConnector implements SolrConnector {
@Override
public SolrDocument getDocumentById(final String id, String... fields) throws IOException {
if (this.missCache.contains(id)) return null;
if (existIdFromDeleteQueue(id)) return null;
SolrInputDocument idoc = getFromUpdateQueue(id);
if (idoc != null) {cacheSuccessSign(); return ClientUtils.toSolrDocument(idoc);}
SolrDocument doc = this.connector.getDocumentById(id, AbstractSolrConnector.ensureEssentialFieldsIncluded(fields));
if (doc != null) updateIdCache(id, AbstractSolrConnector.getLoadDate(doc));
if (doc == null) {
this.missCache.add(id);
} else {
updateCache(id, AbstractSolrConnector.getMetadata(doc));
}
return doc;
}

@ -396,24 +396,24 @@ public class EmbeddedSolrConnector extends SolrServerConnector implements SolrCo
* @throws IOException
*/
@Override
public synchronized long getLoadTime(String id) {
public synchronized Metadata getMetadata(String id) {
int responseCount = 0;
DocListSearcher docListSearcher = null;
try {
docListSearcher = new DocListSearcher("{!raw f=" + CollectionSchema.id.getSolrFieldName() + "}" + id, 0, 1, CollectionSchema.id.getSolrFieldName(), CollectionSchema.load_date_dt.getSolrFieldName());
responseCount = docListSearcher.response.size();
if (responseCount == 0) return -1;
if (responseCount == 0) return null;
SolrIndexSearcher searcher = docListSearcher.request.getSearcher();
DocIterator iterator = docListSearcher.response.iterator();
//for (int i = 0; i < responseCount; i++) {
Document doc = searcher.doc(iterator.nextDoc(), AbstractSolrConnector.SOLR_ID_and_LOAD_DATE_FIELDS);
if (doc == null) return -1;
return AbstractSolrConnector.getLoadDate(doc);
if (doc == null) return null;
return AbstractSolrConnector.getMetadata(doc);
//}
} catch (Throwable e) {} finally {
if (docListSearcher != null) docListSearcher.close();
}
return -1;
return null;
}
@Override

@ -394,19 +394,19 @@ public class MirrorSolrConnector extends AbstractSolrConnector implements SolrCo
}
@Override
public long getLoadTime(String id) throws IOException {
if (this.solr0 != null && this.solr1 == null) return this.solr0.getLoadTime(id);
if (this.solr0 == null && this.solr1 != null) return this.solr1.getLoadTime(id);
if (this.solr0 == null && this.solr1 == null) return -1;
return Math.max(this.solr0.getLoadTime(id), this.solr1.getLoadTime(id));
public Metadata getMetadata(String id) throws IOException {
// Return the metadata for a document id from the mirrored connectors.
// With only one connector configured, its answer is returned as-is; with none, null.
// With both, the entries are merged: the later load date wins and the url is taken
// from whichever entry carries one. Returns null if neither connector knows the id.
if (this.solr0 != null && this.solr1 == null) return this.solr0.getMetadata(id);
if (this.solr0 == null && this.solr1 != null) return this.solr1.getMetadata(id);
if (this.solr0 == null && this.solr1 == null) return null;
Metadata md0 = this.solr0.getMetadata(id);
Metadata md1 = this.solr1.getMetadata(id);
if (md0 == null) return md1;
if (md1 == null) return md0;
long date = Math.max(md0.date, md1.date);
// a url can be null when the sku field was not part of the fetched fields,
// so the consistency check must only compare urls that are both present
assert md0.url == null || md1.url == null || md0.url.equals(md1.url);
// prefer a known url over a missing one
final String url = md0.url != null ? md0.url : md1.url;
return new Metadata(url, date);
}
/*
@Override
public BlockingQueue<SolrDocument> concurrentDocumentsByQuery(String querystring, int offset, int maxcount, long maxtime, int buffersize, String... fields) {
return null;
}
*/
@Override
public BlockingQueue<String> concurrentIDsByQuery(String querystring, int offset, int maxcount, long maxtime) {
if (this.solr0 != null && this.solr1 == null) return this.solr0.concurrentIDsByQuery(querystring, offset, maxcount, maxtime);

@ -36,6 +36,15 @@ import org.apache.solr.common.params.ModifiableSolrParams;
public interface SolrConnector extends Iterable<String> /* Iterable of document IDs */ {
/**
 * Small value holder for what a connector reports about an indexed document:
 * its url and its load date.
 */
public static class Metadata {
// load date as a long; AbstractSolrConnector.getMetadata fills it from a Long field value
// or from Date.getTime(), and leaves -1 when the field value has another type
public long date;
// document url; AbstractSolrConnector.getMetadata reads it from the sku field, so it may be null
public String url;
/**
 * @param url the document url
 * @param date the load date
 */
public Metadata(final String url, final long date) {
this.url = url;
this.date = date;
}
}
/**
* clear all caches: inside solr and outside solr within the implementations of this interface
*/
@ -110,11 +119,11 @@ public interface SolrConnector extends Iterable<String> /* Iterable of document
/**
* check if a given document, identified by url hash as document id exists
* @param id the url hash and document id
* @return the load time if any entry in solr exists, -1 otherwise
* @return the metadata (url and load date) if any entry in solr exists, null otherwise
* @throws IOException
*/
public long getLoadTime(final String id) throws IOException;
public Metadata getMetadata(final String id) throws IOException;
/**
* add a solr input document
* @param solrdoc

@ -161,7 +161,7 @@ public class InstanceMirror {
if (msc != null) return msc;
EmbeddedSolrConnector esc = getEmbeddedConnector(corename);
RemoteSolrConnector rsc = getRemoteConnector(corename);
msc = new ConcurrentUpdateSolrConnector(new MirrorSolrConnector(esc, rsc), 100, 100000, Runtime.getRuntime().availableProcessors());
msc = new ConcurrentUpdateSolrConnector(new MirrorSolrConnector(esc, rsc), RemoteInstance.queueSizeByMemory(), 100000, Runtime.getRuntime().availableProcessors());
this.mirrorConnectorCache.put(corename, msc);
return msc;
}

@ -31,6 +31,7 @@ import java.util.Map;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.search.schema.CollectionSchema;
import net.yacy.search.schema.WebgraphSchema;
@ -49,7 +50,7 @@ import org.apache.http.client.entity.GzipDecompressingEntity;
import org.apache.http.impl.auth.BasicScheme;
import org.apache.http.protocol.HttpContext;
import org.apache.solr.client.solrj.SolrServer;
import org.apache.solr.client.solrj.impl.HttpSolrServer;
import org.apache.solr.client.solrj.impl.ConcurrentUpdateSolrServer;
@SuppressWarnings("deprecation")
public class RemoteInstance implements SolrInstance {
@ -57,9 +58,9 @@ public class RemoteInstance implements SolrInstance {
private String solrurl;
private final Object client; // not declared as org.apache.http.impl.client.DefaultHttpClient to avoid warnings during compilation. TODO: switch to org.apache.http.impl.client.HttpClientBuilder
private final String defaultCoreName;
private final HttpSolrServer defaultServer;
private final ConcurrentUpdateSolrServer defaultServer;
private final Collection<String> coreNames;
private final Map<String, HttpSolrServer> server;
private final Map<String, ConcurrentUpdateSolrServer> server;
private final int timeout;
public static ArrayList<RemoteInstance> getShardInstances(final String urlList, Collection<String> coreNames, String defaultCoreName, final int timeout) throws IOException {
@ -75,7 +76,7 @@ public class RemoteInstance implements SolrInstance {
public RemoteInstance(final String url, final Collection<String> coreNames, final String defaultCoreName, final int timeout) throws IOException {
this.timeout = timeout;
this.server= new HashMap<String, HttpSolrServer>();
this.server= new HashMap<String, ConcurrentUpdateSolrServer>();
this.solrurl = url == null ? "http://127.0.0.1:8983/solr/" : url; // that should work for the example configuration of solr 4.x.x
this.coreNames = coreNames == null ? new ArrayList<String>() : coreNames;
if (this.coreNames.size() == 0) {
@ -178,7 +179,7 @@ public class RemoteInstance implements SolrInstance {
this.client = null;
}
this.defaultServer = (HttpSolrServer) getServer(this.defaultCoreName);
this.defaultServer = (ConcurrentUpdateSolrServer) getServer(this.defaultCoreName);
if (this.defaultServer == null) throw new IOException("cannot connect to url " + url + " and connect core " + defaultCoreName);
}
@ -211,7 +212,7 @@ public class RemoteInstance implements SolrInstance {
@Override
public SolrServer getServer(String name) {
// try to get the server from the cache
HttpSolrServer s = this.server.get(name);
ConcurrentUpdateSolrServer s = this.server.get(name);
if (s != null) return s;
// create new http server
if (this.client != null) {
@ -226,14 +227,14 @@ public class RemoteInstance implements SolrInstance {
String solrpath = u.getPath();
String p = "http://" + host + ":" + port + solrpath;
ConcurrentLog.info("RemoteSolrConnector", "connecting Solr authenticated with url:" + p);
s = new HttpSolrServer(p, ((org.apache.http.impl.client.DefaultHttpClient) this.client));
s = new ConcurrentUpdateSolrServer(p, ((org.apache.http.impl.client.DefaultHttpClient) this.client), 10, Runtime.getRuntime().availableProcessors());
} else {
ConcurrentLog.info("RemoteSolrConnector", "connecting Solr with url:" + this.solrurl + name);
s = new HttpSolrServer(this.solrurl + name);
s = new ConcurrentUpdateSolrServer(this.solrurl + name, queueSizeByMemory(), Runtime.getRuntime().availableProcessors());
}
s.setAllowCompression(true);
s.setConnectionTimeout(this.timeout);
s.setMaxRetries(1); // Solr-Doc: No more than 1 recommended (depreciated)
//s.setAllowCompression(true);
s.setSoTimeout(this.timeout);
//s.setMaxRetries(1); // Solr-Doc: No more than 1 recommended (depreciated)
s.setSoTimeout(this.timeout);
this.server.put(name, s);
return s;
@ -244,4 +245,7 @@ public class RemoteInstance implements SolrInstance {
if (this.client != null) ((org.apache.http.impl.client.DefaultHttpClient) this.client).getConnectionManager().shutdown();
}
/**
 * Derives a queue size from the value reported by {@code MemoryControl.maxMemory()}.
 * The reported value is divided by 1024 twice and then by 12; the result is
 * bounded below by 1.
 * NOTE(review): presumably maxMemory() is a byte count, which would make this
 * one queue slot per 12 MB of memory - confirm against MemoryControl.
 *
 * @return the computed queue size, never smaller than 1
 */
public static int queueSizeByMemory() {
    final long scaled = MemoryControl.maxMemory() / 1024 / 1024;
    final long slots = scaled / 12;
    return (int) (slots < 1 ? 1 : slots);
}
}

@ -413,7 +413,7 @@ public class Balancer {
Latency.updateAfterSelection(crawlEntry.url(), profileEntry == null ? 0 : robotsTime);
if (delay && sleeptime > 0) {
// force a busy waiting here
// in best case, this should never happen if the balancer works propertly
// in best case, this should never happen if the balancer works properly
// this is only a protection against the worst case, where the crawler could
// behave in a DoS-manner
ConcurrentLog.info("BALANCER", "forcing crawl-delay of " + sleeptime + " milliseconds for " + crawlEntry.url().getHost() + ": " + Latency.waitingRemainingExplain(crawlEntry.url(), robots, agent) + ", domainStacks.size() = " + this.domainStacks.size() + ", domainStacksInitSize = " + this.domStackInitSize);

@ -405,7 +405,7 @@ public final class CrawlStacker {
// do double-check
if (dbocc == HarvestProcess.ERRORS) {
final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash);
return "double in: errors (" + errorEntry.getFailReason() + ")";
return "double in: errors (" + (errorEntry == null ? "NULL" : errorEntry.getFailReason()) + ")";
}
return "double in: " + dbocc.toString();
}
@ -422,8 +422,8 @@ public final class CrawlStacker {
}
if (dbocc == HarvestProcess.ERRORS) {
final CollectionConfiguration.FailDoc errorEntry = this.nextQueue.errorURL.get(urlhash);
if (CrawlStacker.log.isInfo()) CrawlStacker.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + errorEntry.getFailReason());
return "double in: errors (" + errorEntry.getFailReason() + "), oldDate = " + oldDate.toString();
if (CrawlStacker.log.isInfo()) CrawlStacker.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "', previous cause: " + (errorEntry == null ? "NULL" : errorEntry.getFailReason()));
return "double in: errors (" + (errorEntry == null ? "NULL" : errorEntry.getFailReason()) + "), oldDate = " + oldDate.toString();
}
if (CrawlStacker.log.isInfo()) CrawlStacker.log.info("URL '" + urlstring + "' is double registered in '" + dbocc.toString() + "'. ");
return "double in: " + dbocc.toString() + ", oldDate = " + oldDate.toString();

@ -59,7 +59,6 @@ import net.yacy.search.IndexingQueueEntry;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import net.yacy.search.index.ErrorCache;
import net.yacy.search.schema.CollectionConfiguration;
public class CrawlQueues {
@ -181,10 +180,6 @@ public class CrawlQueues {
if (u != null) {
return u;
}
CollectionConfiguration.FailDoc ee = this.errorURL.get(ASCII.String(urlhash));
if (ee != null) {
return ee.getDigestURL();
}
for (final Loader w: this.workers.values()) {
if (Base64Order.enhancedCoder.equal(w.request.url().hash(), urlhash)) {
return w.request.url();

@ -31,6 +31,7 @@ import java.net.MalformedURLException;
import java.util.EnumMap;
import net.yacy.cora.date.ISO8601Formatter;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
@ -82,7 +83,7 @@ public class YMarkMetadata {
public YMarkMetadata(final byte[] urlHash, final Segment indexSegment) {
this.document = null;
this.indexSegment = indexSegment;
this.uri = this.indexSegment.fulltext().getURL(urlHash);
this.uri = this.indexSegment.fulltext().getURL(ASCII.String(urlHash));
}
public YMarkMetadata(final Document document) {

@ -61,7 +61,15 @@ import org.eclipse.jetty.util.IO;
* proxies request, caches responses and adds urls to crawler
*/
public class ProxyHandler extends AbstractRemoteHandler implements Handler {
protected int timeout = 10000;
/**
 * Runs the start routine of the super class and afterwards reads the client
 * timeout from the configuration key "proxy.clientTimeout", passing 10000 as
 * the default value.
 */
@Override
protected void doStart() {
    super.doStart();
    final int configuredTimeout = sb.getConfigInt("proxy.clientTimeout", 10000);
    this.timeout = configuredTimeout;
}
public static RequestHeader convertHeaderFromJetty(HttpServletRequest request) {
RequestHeader result = new RequestHeader();
Enumeration<String> headerNames = request.getHeaderNames();
@ -100,7 +108,6 @@ public class ProxyHandler extends AbstractRemoteHandler implements Handler {
setProxyHeaderForClient(request, proxyHeaders);
final HTTPClient client = new HTTPClient(ClientIdentification.yacyProxyAgent);
int timeout = 10000;
client.setTimout(timeout);
client.setHeader(proxyHeaders.entrySet());
client.setRedirecting(false);

@ -1610,7 +1610,7 @@ public final class Switchboard extends serverSwitch {
public DigestURL getURL(final byte[] urlhash) {
if (urlhash == null) return null;
if (urlhash.length == 0) return null;
final DigestURL url = this.index.fulltext().getURL(urlhash);
final DigestURL url = this.index.fulltext().getURL(ASCII.String(urlhash));
if (url != null) return url;
return this.crawlQueues.getURL(urlhash);
}

@ -166,7 +166,9 @@ public class ErrorCache {
}
if (failDoc != null) return failDoc;
try {
SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlhash);
final SolrDocumentList docs = this.fulltext.getDefaultConnector().getDocumentListByQuery(CollectionSchema.id + ":\"" + urlhash + "\" AND " + CollectionSchema.failtype_s.getSolrFieldName() + ":[* TO *]", 0, 1);
if (docs == null || docs.isEmpty()) return null;
SolrDocument doc = docs.get(0);
if (doc == null) return null;
return new CollectionConfiguration.FailDoc(doc);
} catch (final IOException e) {
@ -176,8 +178,13 @@ public class ErrorCache {
}
public boolean exists(final byte[] urlHash) {
String urlHashString = ASCII.String(urlHash);
try {
final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(ASCII.String(urlHash), CollectionSchema.failreason_s.getSolrFieldName());
// first try to check if the document exists at all.
long loaddate = this.fulltext.getLoadTime(urlHashString);
if (loaddate < 0) return false;
// then load the fail reason, if exists
final SolrDocument doc = this.fulltext.getDefaultConnector().getDocumentById(urlHashString, CollectionSchema.failreason_s.getSolrFieldName());
if (doc == null) return false;
// check if the document contains a value in the field CollectionSchema.failreason_s
Object failreason = doc.getFieldValue(CollectionSchema.failreason_s.getSolrFieldName());

@ -32,7 +32,6 @@ import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
@ -56,7 +55,6 @@ import net.yacy.cora.federate.solr.instance.RemoteInstance;
import net.yacy.cora.federate.solr.instance.ShardInstance;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.sorting.ReversibleScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.sorting.WeakPriorityBlockingQueue;
import net.yacy.cora.storage.ZIPReader;
import net.yacy.cora.storage.ZIPWriter;
@ -87,7 +85,6 @@ public final class Fulltext {
private final File segmentPath;
private final File archivePath;
private Export exportthread; // will have a export thread assigned if exporter is running
private ArrayList<HostStat> statsDump;
private InstanceMirror solrInstances;
private final CollectionConfiguration collectionConfiguration;
private final WebgraphConfiguration webgraphConfiguration;
@ -98,7 +95,6 @@ public final class Fulltext {
this.segmentPath = segmentPath;
this.archivePath = archivePath;
this.exportthread = null; // will have a export thread assigned if exporter is running
this.statsDump = null;
this.solrInstances = new InstanceMirror();
this.collectionConfiguration = collectionConfiguration;
this.webgraphConfiguration = webgraphConfiguration;
@ -206,9 +202,7 @@ public final class Fulltext {
}
/**
 * Empties and drops the cached host statistics list and asks the Solr
 * instances to clear their caches.
 */
public void clearCaches() {
    // empty the list first so that holders of the old reference see no stale entries
    if (this.statsDump != null) {
        this.statsDump.clear();
    }
    this.solrInstances.clearCaches();
    // forget the list; statistics(...) creates a new one on demand
    this.statsDump = null;
}
public void clearLocalSolr() throws IOException {
@ -261,7 +255,6 @@ public final class Fulltext {
}
public void close() {
this.statsDump = null;
try {
this.solrInstances.close();
} catch (Throwable e) {}
@ -275,36 +268,6 @@ public final class Fulltext {
getDefaultConnector().commit(softCommit);
if (this.writeWebgraph) getWebgraphConnector().commit(softCommit);
}
/*
public Date getLoadDate(final String urlHash) {
if (urlHash == null) return null;
try {
SolrDocument doc = this.getDefaultConnector().getDocumentById(urlHash, CollectionSchema.load_date_dt.getSolrFieldName());
Object d = doc == null ? null : doc.getFieldValue(CollectionSchema.load_date_dt.getSolrFieldName());
if (d == null) return null;
assert d instanceof Date : "d = " + d.toString();
if (d instanceof Date) return (Date) d;
if (d instanceof Long) return new Date(((Long) d).longValue());
return null;
} catch (final IOException e) {
return null;
}
}
*/
/**
 * Looks up the URL that belongs to a url hash by reading the sku field of the
 * matching document from the default connector.
 *
 * @param urlHash the url hash as bytes; may be null
 * @return the URL for the hash, or null if the hash is null, no default
 *         connector is available, no document or sku value is found, the
 *         value is not a String, or an IOException occurs
 */
public DigestURL getURL(final byte[] urlHash) {
    if (urlHash == null) return null;
    if (this.getDefaultConnector() == null) return null;
    try {
        final String skuField = CollectionSchema.sku.getSolrFieldName();
        final SolrDocument doc = this.getDefaultConnector().getDocumentById(ASCII.String(urlHash), skuField);
        if (doc == null) return null;
        final Object sku = doc.getFieldValue(skuField);
        if (sku == null) return null;
        assert sku instanceof String : "u = " + sku.toString();
        // with assertions disabled a non-String value yields null instead of an error
        return sku instanceof String ? new DigestURL((String) sku, urlHash) : null;
    } catch (final IOException e) {
        // lookup failures are reported as "not found"
        return null;
    }
}
public URIMetadataNode getMetadata(final WeakPriorityBlockingQueue.Element<WordReferenceVars> element) {
if (element == null) return null;
@ -347,7 +310,6 @@ public final class Fulltext {
} catch (final SolrException e) {
throw new IOException(e.getMessage(), e);
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCaches();
}
@ -359,7 +321,6 @@ public final class Fulltext {
} catch (final SolrException e) {
throw new IOException(e.getMessage(), e);
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCaches();
}
@ -371,14 +332,13 @@ public final class Fulltext {
String id = ASCII.String(idb);
try {
// because node entries are richer than metadata entries we must check if they exist to prevent that they are overwritten
SolrDocument sd = this.getDefaultConnector().getDocumentById(id);
if (sd == null || (new URIMetadataNode(sd)).isOlder(entry)) {
long date = this.getLoadTime(id);
if (date < entry.loaddate().getTime()) {
putDocument(getDefaultConfiguration().metadata2solr(entry));
}
} catch (final SolrException e) {
throw new IOException(e.getMessage(), e);
}
this.statsDump = null;
if (MemoryControl.shortStatus()) clearCaches();
}
@ -398,16 +358,6 @@ public final class Fulltext {
if (this.writeWebgraph) deleteDomainWithConstraint(this.getWebgraphConnector(), WebgraphSchema.source_host_id_s.getSolrFieldName(), hosthashes,
(freshdate == null || freshdate.after(now)) ? null :
(WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
// remove the line with statistics
if (Fulltext.this.statsDump != null) {
final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
HostStat hs;
while (hsi.hasNext()) {
hs = hsi.next();
if (hosthashes.contains(hs.hosthash)) hsi.remove();
}
}
}
public void deleteStaleDomainNames(final Set<String> hostnames, Date freshdate) {
@ -419,16 +369,6 @@ public final class Fulltext {
if (this.writeWebgraph) deleteDomainWithConstraint(this.getWebgraphConnector(), WebgraphSchema.source_host_s.getSolrFieldName(), hostnames,
(freshdate == null || freshdate.after(now)) ? null :
(WebgraphSchema.load_date_dt.getSolrFieldName() + ":[* TO " + ISO8601Formatter.FORMATTER.format(freshdate) + "]"));
// finally remove the line with statistics
if (Fulltext.this.statsDump != null) {
final Iterator<HostStat> hsi = Fulltext.this.statsDump.iterator();
HostStat hs;
while (hsi.hasNext()) {
hs = hsi.next();
if (hostnames.contains(hs.hostname)) hsi.remove();
}
}
}
/**
@ -528,6 +468,18 @@ public final class Fulltext {
return false;
}
/**
 * Looks up the URL that belongs to a url hash using the metadata provided by
 * the default connector.
 *
 * @param urlHash the url hash as String; may be null
 * @return the URL for the hash, or null if the hash is null, no default
 *         connector is available, no metadata is found, or an IOException occurs
 */
public DigestURL getURL(final String urlHash) {
    if (urlHash == null) return null;
    if (this.getDefaultConnector() == null) return null;
    try {
        final SolrConnector.Metadata metadata = this.getDefaultConnector().getMetadata(urlHash);
        return metadata == null ? null : new DigestURL(metadata.url, ASCII.getBytes(urlHash));
    } catch (final IOException e) {
        // lookup failures are reported as "not found"
        return null;
    }
}
/**
* get the load time of a resource.
* @param urlHash
@ -536,20 +488,14 @@ public final class Fulltext {
public long getLoadTime(final String urlHash) {
if (urlHash == null) return -1l;
try {
return this.getDefaultConnector().getLoadTime(urlHash);
SolrConnector.Metadata md = this.getDefaultConnector().getMetadata(urlHash);
if (md == null) return -1;
return md.date;
} catch (final Throwable e) {
ConcurrentLog.logException(e);
}
return -1l;
}
/**
 * Reads the fail reason stored for a document.
 *
 * @param urlHash the url hash (document id); may be null
 * @return the content of the failreason_s field, or null if the hash is null,
 *         the document does not exist, or the field is missing or empty
 * @throws IOException if the default connector fails to load the document
 */
public String failReason(final String urlHash) throws IOException {
    if (urlHash == null) return null;
    final String field = CollectionSchema.failreason_s.getSolrFieldName();
    final SolrDocument doc = this.getDefaultConnector().getDocumentById(urlHash, field);
    if (doc == null) return null;
    final Object reason = doc.getFieldValue(field);
    if (reason == null) return null;
    // getFieldValue returns Object; a value that is not a String must not be cast
    // blindly (that raised a ClassCastException), so it is converted instead
    final String text = reason instanceof String ? (String) reason : reason.toString();
    return text == null || text.length() == 0 ? null : text;
}
public List<File> dumpFiles() {
EmbeddedInstance esc = this.solrInstances.getEmbedded();
@ -790,42 +736,5 @@ public final class Fulltext {
}
}
/**
 * Produces host statistics entries from a score map of url hashes.
 * The result list is kept in the field statsDump and re-used on later calls
 * as long as it holds at least the requested number of entries.
 *
 * @param count the number of wanted entries; 10 is added internally, and the
 *        value is replaced by domainScore.size() if it is negative after the
 *        addition or if the score map is smaller
 * @param domainScore score map whose keys are url hashes; the score of each
 *        key becomes the count of the produced entry
 * @return an iterator over the collected entries; an empty iterator if the
 *         field statsDump was found to be null while or after collecting
 */
public Iterator<HostStat> statistics(int count, final ScoreMap<String> domainScore) {
// prevent too heavy IO.
if (this.statsDump != null && count <= this.statsDump.size()) return this.statsDump.iterator();
// fetch urls from the database to determine the host in clear text
final Iterator<String> j = domainScore.keys(false); // iterate urlhash-examples in reverse order (biggest first)
String urlhash;
count += 10; // make some more to prevent that we have to do this again after deletions too soon.
if (count < 0 || domainScore.sizeSmaller(count)) count = domainScore.size();
// the new list is stored in the field right away, so other methods that reset the field can affect it while it is filled
this.statsDump = new ArrayList<HostStat>();
DigestURL url;
while (j.hasNext()) {
urlhash = j.next();
if (urlhash == null) continue;
// resolve the hash to a URL; hashes without a resolvable URL or host are skipped and do not count
url = this.getURL(ASCII.getBytes(urlhash));
if (url == null || url.getHost() == null) continue;
if (this.statsDump == null) return new ArrayList<HostStat>().iterator(); // some other operation has destroyed the object
// urlhash.substring(6) is stored as the host hash; the HostStat constructor asserts that it has 6 characters
this.statsDump.add(new HostStat(url.getHost(), url.getPort(), urlhash.substring(6), domainScore.get(urlhash)));
// stop once the wanted number of entries has been added
count--;
if (count == 0) break;
}
// finally return an iterator for the result array
return (this.statsDump == null) ? new ArrayList<HostStat>().iterator() : this.statsDump.iterator();
}
/**
 * One line of the host statistics: host name, port, host hash and a count.
 * Instances are created only inside the enclosing class.
 */
public static class HostStat {
    public String hostname;
    public String hosthash;
    public int port;
    public int count;

    /**
     * @param host the host name
     * @param port the port of the host
     * @param urlhashfragment the host hash; asserted to be 6 characters long
     * @param count the count assigned to this host
     */
    private HostStat(final String host, final int port, final String urlhashfragment, final int count) {
        assert urlhashfragment.length() == 6;
        this.hosthash = urlhashfragment;
        this.hostname = host;
        this.count = count;
        this.port = port;
    }
}
}

@ -803,7 +803,7 @@ public class Segment {
if (urlhash == null) return 0;
// determine the url string
final DigestURL url = fulltext().getURL(urlhash);
final DigestURL url = fulltext().getURL(ASCII.String(urlhash));
if (url == null) return 0;
try {

@ -29,13 +29,8 @@ import java.io.File;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.sorting.OrderedScoreMap;
import net.yacy.cora.sorting.ScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.kelondro.index.BinSearch;
@ -47,10 +42,8 @@ import net.yacy.peers.Seed;
import net.yacy.peers.SeedDB;
import net.yacy.peers.graphics.WebStructureGraph;
import net.yacy.peers.graphics.WebStructureGraph.HostReference;
import net.yacy.search.index.Fulltext.HostStat;
import net.yacy.search.index.Segment;
public class BlockRank {
/**
@ -149,64 +142,6 @@ public class BlockRank {
return index;
}
/**
 * Computes ranking tables of host hashes from a host reference index.
 * Every non-empty container of the index gets a score; the scored hosts are
 * then split into partitions by repeatedly removing the lower half of the
 * score map. The partitions are returned in reverse order of their creation,
 * limited to 16 tables, so that index 0 holds the partition computed last.
 *
 * @param index the host reference index to evaluate
 * @param hostHashResolver maps host hashes to host statistics; the count of an
 *        entry is used to normalize the score, unknown hosts are treated as
 *        having a count of 6
 * @param referenceTable ranking tables of a previous evaluation, or null; if
 *        null, the score is based on the container size, otherwise on the
 *        ranking of each referencing host in these tables
 * @param recusions number of additional evaluation rounds that re-use the
 *        computed tables as reference table; values of 0 or less perform no
 *        further round
 * @return the ranking tables, at most 16
 */
public static BinSearch[] evaluate(final ReferenceContainerCache<HostReference> index, final Map<String, HostStat> hostHashResolver, final BinSearch[] referenceTable, int recusions) {
    // first find out the maximum count of the hostHashResolver
    int maxHostCount = 1;
    for (final HostStat stat: hostHashResolver.values()) {
        if (stat.count > maxHostCount) maxHostCount = stat.count;
    }

    // then just count the number of references. all other information from the index is not used because they cannot be trusted
    final ScoreMap<byte[]> hostScore = new OrderedScoreMap<byte[]>(index.termKeyOrdering());
    HostStat hostStat;
    int hostCount;
    for (final ReferenceContainer<HostReference> container: index) {
        if (container.isEmpty()) continue;
        if (referenceTable == null) {
            // first round: score by number of references, normalized by the host's count
            hostStat = hostHashResolver.get(ASCII.String(container.getTermHash()));
            hostCount = hostStat == null ? 6 /* high = a penalty for 'i do not know this', this may not be fair*/ : Math.max(1, hostStat.count);
            hostScore.set(container.getTermHash(), container.size() * maxHostCount / hostCount);
        } else {
            // later rounds: weight every reference by the ranking of the referencing host
            int score = 0;
            final Iterator<HostReference> hri = container.entries();
            HostReference hr;
            while (hri.hasNext()) {
                hr = hri.next();
                hostStat = hostHashResolver.get(ASCII.String(hr.urlhash()));
                hostCount = hostStat == null ? 6 /* high = a penalty for 'i do not know this', this may not be fair*/ : Math.max(1, hostStat.count);
                score += (17 - ranking(hr.urlhash(), referenceTable)) * maxHostCount / hostCount;
            }
            hostScore.set(container.getTermHash(), score);
        }
    }

    // now divide the scores into two halves until the score map is empty
    final List<BinSearch> table = new ArrayList<BinSearch>();
    while (hostScore.size() > 10) {
        final List<byte[]> smallest = hostScore.lowerHalf();
        if (smallest.isEmpty()) break; // should never happen but this ensures termination of the loop
        ConcurrentLog.info("BlockRank", "index evaluation: computed partition of size " + smallest.size());
        table.add(new BinSearch(smallest, 6));
        for (final byte[] host: smallest) hostScore.delete(host);
    }
    if (!hostScore.isEmpty()) {
        final ArrayList<byte[]> list = new ArrayList<byte[]>();
        for (final byte[] entry: hostScore) list.add(entry);
        ConcurrentLog.info("BlockRank", "index evaluation: computed last partition of size " + list.size());
        table.add(new BinSearch(list, 6));
    }

    // the last table entry has now a list of host hashes that has the most references
    final int binTables = Math.min(16, table.size());
    final BinSearch[] newTables = new BinSearch[binTables];
    for (int i = 0; i < binTables; i++) newTables[i] = table.get(table.size() - i - 1);

    // re-use the new table for a recursion
    // '<=' instead of '==': a negative count would otherwise never reach 0 and recurse until the stack overflows
    if (recusions <= 0) return newTables;
    return evaluate(index, hostHashResolver, newTables, recusions - 1); // one recursion step
}
public static int ranking(final byte[] hash, final BinSearch[] rankingTable) {
if (rankingTable == null) return 16;
byte[] hosthash;

@ -56,6 +56,7 @@ import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.SchemaDeclaration;
import net.yacy.cora.federate.solr.connector.AbstractSolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector;
import net.yacy.cora.federate.solr.connector.SolrConnector.Metadata;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
@ -1248,8 +1249,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
for (Map.Entry<byte[], CRV> entry: rm.entrySet()) {
if (entry == null || entry.getValue() == null) continue;
try {
String url = (String) connector.getDocumentById(ASCII.String(entry.getKey()), CollectionSchema.sku.getSolrFieldName()).getFieldValue(CollectionSchema.sku.getSolrFieldName());
ConcurrentLog.info("CollectionConfiguration", "CR for " + url);
Metadata md = connector.getMetadata(ASCII.String(entry.getKey()));
ConcurrentLog.info("CollectionConfiguration", "CR for " + md.url);
ConcurrentLog.info("CollectionConfiguration", ">> " + entry.getValue().toString());
} catch (final IOException e) {
ConcurrentLog.logException(e);

Loading…
Cancel
Save