Merge remote branch 'origin/master'

pull/1/head
sof 13 years ago
commit 5cb244b79b

@@ -39,7 +39,7 @@
<classpathentry kind="lib" path="lib/jcl-over-slf4j-1.6.1.jar"/>
<classpathentry kind="lib" path="lib/wstx-asl-3.2.7.jar"/>
<classpathentry kind="lib" path="lib/commons-codec-1.6.jar"/>
<classpathentry kind="lib" path="lib/httpcore-4.2.1.jar" sourcepath="/Volumes/Raptor/Data/sourcecode/httpcore/src/main/java"/>
<classpathentry kind="lib" path="lib/httpcore-4.2.2.jar" sourcepath="/Volumes/Raptor/Data/sourcecode/httpcore/src/main/java"/>
<classpathentry kind="lib" path="lib/httpclient-4.2.1.jar" sourcepath="/Volumes/Raptor/Data/sourcecode/httpclient/src/main/java"/>
<classpathentry kind="lib" path="lib/httpmime-4.2.1.jar"/>
<classpathentry kind="lib" path="lib/commons-io-2.1.jar"/>

@@ -58,7 +58,7 @@
<string>$JAVAROOT/lib/guava-r05.jar</string>
<string>$JAVAROOT/lib/htmllexer.jar</string>
<string>$JAVAROOT/lib/httpclient-4.2.1.jar</string>
<string>$JAVAROOT/lib/httpcore-4.2.1.jar</string>
<string>$JAVAROOT/lib/httpcore-4.2.2.jar</string>
<string>$JAVAROOT/lib/httpmime-4.2.1.jar</string>
<string>$JAVAROOT/lib/icu4j-core.jar</string>
<string>$JAVAROOT/lib/iri-0.8.jar</string>

@@ -177,7 +177,7 @@
<pathelement location="${lib}/guava-r05.jar" />
<pathelement location="${lib}/htmllexer.jar" />
<pathelement location="${lib}/httpclient-4.2.1.jar" />
<pathelement location="${lib}/httpcore-4.2.1.jar" />
<pathelement location="${lib}/httpcore-4.2.2.jar" />
<pathelement location="${lib}/httpmime-4.2.1.jar" />
<pathelement location="${lib}/icu4j-core.jar" />
<pathelement location="${lib}/iri-0.8.jar" />

@@ -119,6 +119,9 @@ responsetime_i
## all visible text, text
text_t
## additional synonyms to the words in the text
synonyms_sxt
## h1 header
h1_txt
@@ -282,6 +285,15 @@ italic_txt
## total number of occurrences of <i>, int
#italiccount_i
## all texts inside of <u> tags. no doubles. listed in the order of number of occurrences in decreasing order
underline_txt
## number of occurrences of texts in underline_txt
#underline_val
## total number of occurrences of <u>, int
#underlinecount_i
## flag that shows if a swf file is linked, boolean
#flash_b

@@ -1114,3 +1114,7 @@ interaction.dontimportbookmarks =
interaction.autocrawler.enabled = false
interaction.autocrawler.domainfilter = .*
interaction.autocrawler.categoryfilter = .*
# host browser settings
browser.autoload = true
browser.load4everyone = false

@@ -23,6 +23,7 @@ import java.io.OutputStream;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Map;
import java.util.regex.Pattern;
@@ -35,6 +36,7 @@ import net.yacy.cora.protocol.RequestHeader;
import net.yacy.kelondro.logging.Log;
import net.yacy.search.Switchboard;
import net.yacy.search.query.AccessTracker;
import net.yacy.search.query.QueryParams;
import net.yacy.search.query.SnippetProcess;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@@ -105,10 +107,11 @@ public class searchresult {
//post.put(, post.remove("client"));//required, example: myfrontend
//post.put(, post.remove("output"));//required, example: xml,xml_no_dtd
String q = post.get(CommonParams.Q, "");
post.put("originalQuery", q);
post.put(CommonParams.ROWS, post.remove("num"));
post.put(CommonParams.ROWS, Math.min(post.getInt(CommonParams.ROWS, 10), (authenticated) ? 5000 : 100));
post.put("hl", "true");
post.put("hl.fl", YaCySchema.h1_txt.name() + ","+ YaCySchema.h1_txt.name() + ","+ YaCySchema.text_t.name());
post.put("hl.fl", YaCySchema.h1_txt.name() + "," + YaCySchema.h2_txt.name() + "," + YaCySchema.text_t.name());
post.put("hl.alternateField", YaCySchema.description.name());
post.put("hl.simple.pre", "<b>");
post.put("hl.simple.post", "</b>");
@@ -124,6 +127,10 @@ public class searchresult {
String access = post.remove("access");
String entqr = post.remove("entqr");
// get a solr query string
Collection<String>[] cq = QueryParams.cleanQuery(q);
q = QueryParams.solrQueryString(cq[0], cq[1], sb.index.fulltext().getSolrScheme()).toString();
// add sites operator
if (site != null && site.length() > 0) {
String[] s0 = site.split(Pattern.quote("|"));
@@ -143,7 +150,8 @@
}
post.put(CommonParams.Q, q);
}
post.put(CommonParams.Q, q);
// get the embedded connector
EmbeddedSolrConnector connector = (EmbeddedSolrConnector) sb.index.fulltext().getLocalSolr();
if (connector == null) return null;
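Worked example for the ROWS clamp above (hypothetical values): a client sending num=500 is limited to min(500, 100) = 100 rows when unauthenticated and to min(500, 5000) = 500 when authenticated; without a num parameter the default of 10 applies.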

@@ -23,7 +23,7 @@ commons-lang-2.6.jar
geronimo-stax-api_1.0_spec-1.0.1.jar
guava-r05.jar
httpclient-4.2.1.jar
httpcore-4.2.1.jar
httpcore-4.2.2.jar
jcl-over-slf4j-1.6.1.jar
log4j-over-slf4j-1.6.1.jar
lucene-analyzers-3.6.0.jar

@@ -173,10 +173,4 @@
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
This project contains annotations derived from JCIP-ANNOTATIONS
Copyright (c) 2005 Brian Goetz and Tim Peierls.
See http://www.jcip.net and the Creative Commons Attribution License
(http://creativecommons.org/licenses/by/2.5)
END OF TERMS AND CONDITIONS

@@ -450,9 +450,15 @@ The search result was discovered by a heuristic, not previously known by YaCy==D
When a search is made using a \'site\'-operator \(like: \'download site:yacy.net\'\) then the host of the site-operator is instantly crawled with a host-restricted depth-1 crawl.==Wenn eine Suche mit dem 'site'-Operator gestartet wird (z.B.: 'download site:yacy.net') dann wird der Host des 'site'-Operator sofort gecrawlt mit einer auf den Host beschränkten Suchtiefe von 1.
That means: right after the search request the portal page of the host is loaded and every page that is linked on this page that points to a page on the same host.==Das bedeutet: Gleich nach der Suchanfrage wird die Portalseite des Hosts geladen und jede verlinkte Seite die auf eine Seite auf demselben Host verweist.
Because this \'instant crawl\' must obey the robots.txt and a minimum access time for two consecutive pages, this heuristic is rather slow, but may discover all wanted search results using a second search \(after a small pause of some seconds\).==Weil dieser 'Sofort Crawl' auch die robots.txt und eine minimale Zugriffszeit für folgende Seiten berücksichtigen muss, ist diese Heuristik sehr langsam - aber kann alle gewünschten Suchergebniss finden indem eine zweite Suche (nach einigen Sekunden Pause) gestartet wird.
scroogle: load external search result list==scroogle: externe Suchergebnis Listen laden
When using this heuristic, then every search request line is used for a call to scroogle.==Diese Heuristik verwendet jede Zeile der Suchanfrage für einen Aufruf von Scroogle.
20 results are taken from scroogle and loaded simultanously, parsed and indexed immediately.==20 Ergebnisse werden von Scroogle geholt und simultan geladen, geparst und sofort indexiert.
search-result: shallow crawl on all displayed search results==Suchergebnis: crawl Links aller angezeigten Suchergebnisse
When a search is made then all displayed result links are crawled with a depth-1 crawl.==Nach einer Suche werden alle angezeigten Ergebnislinks der Crawler Liste (mit einer Suchtiefe von 1) hinzugefügt.
This means: right after the search request every page is loaded and every page that is linked on this page.==Das bedeutet: direkt nach der Suche wird jeder Link auf den Ergebnisseiten der Suche indexiert.
If you check \'add as global crawl job\' the pages to be crawled are added to the global crawl queue \(remote peers can pickup pages to be crawled\).==Wenn 'als globaler Crawl hinzufügen' gewählt ist werden die zu indexierenden Seiten dem globalen Crawler hinzugefügt (entfernte Peers können beim Crawlen unterstützen).
Default is to add the links to the local crawl queue \(your peer crawls the linked pages\).==Vorgabe ist die Links der lokalen Crawl Queue hinzuzufügen.
add as global crawl job==als globaler Crawl hinzufügen
blekko: load external search result list from==blekko: lade externe Suchergebnisse von
When using this heuristic, then every search request line is used for a call to blekko.==Wenn diese Heuristik aktiv ist werden alle lokalen Suchanfragen an blekko weitergeleitet.
20 results are taken from blekko and loaded simultanously, parsed and indexed immediately.==Die ersten 20 Ergebnisse von blekko werden geladen und sofort indexiert.
#-----------------------------
#File: ConfigHTCache_p.html

@@ -77,7 +77,7 @@
<compilation-unit>
<package-root>source</package-root>
<package-root>htroot</package-root>
<classpath mode="compile">lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/apache-solr-core-3.6.0.jar;lib/apache-solr-solrj-3.6.0.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.6.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.1.jar;lib/fontbox-1.7.0.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-r05.jar;lib/htmllexer.jar;lib/htmlparser.jar;lib/httpclient-4.2.1.jar;lib/httpcore-4.2.1.jar;lib/httpmime-4.2.1.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.6.1.jar;lib/jempbox-1.7.0.jar;lib/jena-2.6.4.jar;lib/jetty-6.1.26-patched-JETTY-1340.jar;lib/jetty-util-6.1.26-patched-JETTY-1340.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/log4j-1.2.16.jar;lib/log4j-over-slf4j-1.6.1.jar;lib/lucene-analyzers-3.6.0.jar;lib/lucene-core-3.6.0.jar;lib/lucene-highlighter-3.6.0.jar;lib/lucene-phonetic-3.6.0.jar;lib/lucene-spatial-3.6.0.jar;lib/lucene-spellchecker-3.6.0.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.0.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.6.1.jar;lib/slf4j-jdk14-1.6.1.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar</classpath>
<classpath mode="compile">lib/activation.jar;lib/apache-mime4j-0.6.jar;lib/apache-solr-core-3.6.1.jar;lib/apache-solr-solrj-3.6.1.jar;lib/arq-2.8.7.jar;lib/bcmail-jdk15-145.jar;lib/bcprov-jdk15-145.jar;lib/commons-codec-1.6.jar;lib/commons-compress-1.4.1.jar;lib/commons-fileupload-1.2.2.jar;lib/commons-httpclient-3.1.jar;lib/commons-io-2.1.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.1.1.jar;lib/fontbox-1.7.0.jar;lib/geronimo-stax-api_1.0_spec-1.0.1.jar;lib/guava-r05.jar;lib/htmllexer.jar;lib/htmlparser.jar;lib/httpclient-4.2.1.jar;lib/httpcore-4.2.2.jar;lib/httpmime-4.2.1.jar;lib/icu4j-core.jar;lib/iri-0.8.jar;lib/J7Zip-modified.jar;lib/jakarta-oro-2.0.8.jar;lib/jcifs-1.3.15.jar;lib/jcl-over-slf4j-1.6.1.jar;lib/jempbox-1.7.0.jar;lib/jena-2.6.4.jar;lib/jetty-6.1.26-patched-JETTY-1340.jar;lib/jetty-util-6.1.26-patched-JETTY-1340.jar;lib/jsch-0.1.42.jar;lib/json-simple-1.1.jar;lib/jsoup-1.6.3.jar;lib/log4j-1.2.16.jar;lib/log4j-over-slf4j-1.6.1.jar;lib/lucene-analyzers-3.6.1.jar;lib/lucene-core-3.6.1.jar;lib/lucene-highlighter-3.6.1.jar;lib/lucene-phonetic-3.6.1.jar;lib/lucene-spatial-3.6.1.jar;lib/lucene-spellchecker-3.6.1.jar;lib/metadata-extractor-2.4.0-beta-1.jar;lib/mysql-connector-java-5.1.12-bin.jar;lib/pdfbox-1.7.0.jar;lib/poi-3.6-20091214.jar;lib/poi-scratchpad-3.6-20091214.jar;lib/sax-2.0.1.jar;lib/servlet-api-2.5-20081211.jar;lib/slf4j-api-1.6.1.jar;lib/slf4j-jdk14-1.6.1.jar;lib/webcat-0.1-swf.jar;lib/wstx-asl-3.2.7.jar;lib/xercesImpl.jar;lib/xml-apis.jar</classpath>
<source-level>1.6</source-level>
</compilation-unit>
</java-data>

@@ -67,6 +67,7 @@ public enum YaCySchema implements Schema {
imagescount_i(SolrType.integer, true, true, false, "number of images"),
responsetime_i(SolrType.integer, true, true, false, "response time of target server in milliseconds"),
text_t(SolrType.text_general, true, true, false, "all visible text"),
synonyms_sxt(SolrType.string, true, true, true, "additional synonyms to the words in the text"),
h1_txt(SolrType.text_general, true, true, true, "h1 header"),
h2_txt(SolrType.text_general, true, true, true, "h2 header"),
h3_txt(SolrType.text_general, true, true, true, "h3 header"),
@@ -127,6 +128,8 @@ public enum YaCySchema implements Schema {
boldcount_i(SolrType.integer, true, true, false, "total number of occurrences of <b> or <strong>"),
italic_txt(SolrType.text_general, true, true, true, "all texts inside of <i> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
italiccount_i(SolrType.integer, true, true, false, "total number of occurrences of <i>"),
underline_txt(SolrType.text_general, true, true, true, "all texts inside of <u> tags. no doubles. listed in the order of number of occurrences in decreasing order"),
underlinecount_i(SolrType.integer, true, true, false, "total number of occurrences of <u>"),
flash_b(SolrType.bool, true, true, false, "flag that shows if a swf file is linked"),
frames_txt(SolrType.text_general, true, true, true, "list of all links to frames"),
framesscount_i(SolrType.integer, true, true, false, "number of frames_txt"),
@@ -165,6 +168,7 @@ public enum YaCySchema implements Schema {
// special values; can only be used if '_val' type is defined in schema file; this is not standard
bold_val(SolrType.integer, true, true, true, "number of occurrences of texts in bold_txt"),
italic_val(SolrType.integer, true, true, true, "number of occurrences of texts in italic_txt"),
underline_val(SolrType.integer, true, true, true, "number of occurrences of texts in underline_txt"),
ext_cms_txt(SolrType.text_general, true, true, true, "names of cms attributes; if several are recognized then they are listed in decreasing order of the number of matching criteria"),
ext_cms_val(SolrType.integer, true, true, true, "number of attributes that count for a specific cms in ext_cms_txt"),
ext_ads_txt(SolrType.text_general, true, true, true, "names of ad-servers/ad-services"),

@@ -161,7 +161,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
// write header
writer.write(XML_START);
String query = request.getParams().get("q");
String query = request.getParams().get("originalQuery");
String site = (String) context.get("site");
OpensearchResponseWriter.solitaireTag(writer, "TM", Long.toString(System.currentTimeMillis() - start));
OpensearchResponseWriter.solitaireTag(writer, "Q", query);
@@ -170,7 +170,7 @@ public class GSAResponseWriter implements QueryResponseWriter {
paramTag(writer, "ie", "UTF-8");
paramTag(writer, "oe", "UTF-8");
paramTag(writer, "client", (String) context.get("client"));
paramTag(writer, "q", request.getParams().get("q"));
paramTag(writer, "q", query);
paramTag(writer, "site", site);
paramTag(writer, "start", Integer.toString(resHead.offset));
paramTag(writer, "num", Integer.toString(resHead.rows));
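A sketch of what the originalQuery change achieves (not part of the commit; the values are illustrative): searchresult.java stashes the raw user query before rewriting it into multi-field Solr syntax, so the GSA-style XML now echoes what the user typed instead of the rewritten string.
// hedged illustration: suppose the user typed "yacy"
// searchresult.java stored: post.put("originalQuery", "yacy");
// the q parameter was then rewritten to something like "((sku:yacy) OR (title:yacy) OR ...)"
String query = request.getParams().get("originalQuery"); // -> "yacy"
OpensearchResponseWriter.solitaireTag(writer, "Q", query); // emits <Q>yacy</Q>
paramTag(writer, "q", query); // the PARAM echo now also shows the original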

@@ -0,0 +1,112 @@
/**
* Stemming
* Copyright 2012 by Michael Peter Christen; mc@yacy.net, Frankfurt a. M., Germany
* first published 01.10.2012 on http://yacy.net
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.cora.language.synonyms;
import java.io.File;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import org.apache.log4j.Logger;
import net.yacy.cora.storage.Files;
/**
* Stemming library: reads stemming files and creates a mapping from words to synonyms
* Stemming files must have a list of synonym words in each line of the input file.
* The words within one line must be separated by ','. Lines starting with '#' are
* comment lines and are ignored. Each line can (but does not need to) have a '{'
* at the beginning of the line and '}' at the end (which would be the GSA format).
*/
public class SynonymLibrary {
Logger log = Logger.getLogger(SynonymLibrary.class);
private Map<String, List<Set<String>>> lib;
public SynonymLibrary(final File path) {
this.lib = new HashMap<String, List<Set<String>>>();
if (!path.exists() || !path.isDirectory()) return;
final String[] files = path.list();
for (final String f: files) {
File ff = new File(path, f);
String line;
try {
BlockingQueue<String> list = Files.concurentLineReader(ff, 1000);
while ((line = list.take()) != Files.POISON_LINE) {
line = line.trim();
if (line.length() == 0 || line.charAt(0) == '#') continue;
if (line.charAt(line.length() - 1) == '}') line = line.substring(0, line.length() - 1);
if (line.charAt(0) == '{') line = line.substring(1);
String[] words = line.split(",");
Set<String> synonyms = new HashSet<String>();
Set<String> keys = new HashSet<String>();
for (String s: words) {
s = s.trim();
if (s.length() < 2) continue;
String t = s.toLowerCase();
synonyms.add(t);
keys.add(t.substring(0, 2));
}
for (String key: keys) {
List<Set<String>> symsetlist = this.lib.get(key);
if (symsetlist == null) {
symsetlist = new ArrayList<Set<String>>();
this.lib.put(key, symsetlist);
}
symsetlist.add(synonyms);
}
}
} catch (Throwable e) {
log.warn("cannot read stemming file " + f, e);
}
}
}
/**
* for a given word, return a list of synonym words
* @param word
* @return the set of synonyms, but without the requested word
*/
public Set<String> getSynonyms(String word) {
word = word.toLowerCase();
if (word.length() < 2) return null;
String key = word.substring(0, 2);
List<Set<String>> symsetlist = this.lib.get(key);
if (symsetlist == null) return null;
for (Set<String> symset: symsetlist) {
if (symset.contains(word)) {
// create a new set containing all but the one word
Set<String> returnSet = new HashSet<String>();
for (String synonym: symset) {
if (synonym.equals(word)) continue;
returnSet.add(synonym);
}
return returnSet;
}
}
return null;
}
}
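A minimal usage sketch of the new class (not part of this commit; the file name and contents are hypothetical, following the format described in the class comment):
// hypothetical dictionary file DATA/DICTIONARIES/synonyms/example.txt:
//   # comment lines are ignored
//   {car, automobile, vehicle}
//   fast, quick, rapid
SynonymLibrary lib = new SynonymLibrary(new File("DATA/DICTIONARIES/synonyms"));
Set<String> syms = lib.getSynonyms("car"); // -> {automobile, vehicle}; the queried word itself is excluded
// lookups are keyed on the first two lowercased characters, so words shorter than two characters yield null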

@@ -85,7 +85,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
}
//get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, false).words();
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false).words();
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;

@@ -24,9 +24,12 @@ import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Properties;
@@ -38,6 +41,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ImageEntry;
@@ -66,7 +70,8 @@ public final class Condenser {
//private Properties analysis;
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
private final Set<String> synonyms; // a set of synonyms to the words
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
public int RESULT_NUMB_SENTENCES = -1;
@@ -79,12 +84,14 @@
final boolean indexText,
final boolean indexMedia,
final WordCache meaningLib,
final SynonymLibrary synonyms,
final boolean doAutotagging
) {
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
this.words = new HashMap<String, Word>();
this.synonyms = new LinkedHashSet<String>();
this.RESULT_FLAGS = new Bitfield(4);
// construct flag set for document
@@ -202,6 +209,14 @@
if (!this.tags.isEmpty()) {
document.addMetatags(this.tags);
}
// create the synonyms set
if (synonyms != null) {
for (String word: this.words.keySet()) {
Set<String> syms = synonyms.getSynonyms(word);
if (syms != null) this.synonyms.addAll(syms);
}
}
}
private void insertTextToWords(
@@ -239,6 +254,7 @@
this.languageIdentificator = null; // we don't need that here
// analysis = new Properties();
this.words = new TreeMap<String, Word>();
this.synonyms = new HashSet<String>();
createCondensement(text, meaningLib, doAutotagging);
}
@@ -254,6 +270,12 @@
// returns the words as word/indexWord relation map
return this.words;
}
public List<String> synonyms() {
ArrayList<String> l = new ArrayList<String>(this.synonyms.size());
for (String s: this.synonyms) l.add(s);
return l;
}
public String language() {
return this.languageIdentificator.getLanguage();
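Taken together, the Condenser additions enable index-time synonym enrichment roughly as follows (a sketch assuming a parsed document and an initialized LibraryProvider, matching the Switchboard call sites below):
Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false);
List<String> syms = condenser.synonyms(); // deduplicated synonyms of all words found in the document
// SolrConfiguration.yacy2solr later writes this list into the new synonyms_sxt field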

@@ -47,6 +47,7 @@ import net.yacy.cora.document.WordCache;
import net.yacy.cora.geo.GeonamesLocation;
import net.yacy.cora.geo.OpenGeoDBLocation;
import net.yacy.cora.geo.OverarchingLocation;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
@@ -61,19 +62,19 @@ public class LibraryProvider {
public static final String path_to_source_dictionaries = "source";
public static final String path_to_did_you_mean_dictionaries = "didyoumean";
public static final String path_to_autotagging_dictionaries = "autotagging";
public static final String path_to_synonym_dictionaries = "synonyms";
public static final String disabledExtension = ".disabled";
public static WordCache dymLib = new WordCache(null);
public static Autotagging autotagging = null;
public static SynonymLibrary synonyms = null;
public static OverarchingLocation geoLoc = new OverarchingLocation();
private static File dictSource = null;
private static File dictRoot = null;
public static enum Dictionary {
GEODB0(
"geo0",
"http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz" ),
GEODB0( "geo0", "http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz" ),
GEODB1( "geo1", "http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02624_2011-10-17.sql.gz" ),
GEON0( "geon0", "http://download.geonames.org/export/dump/cities1000.zip" ),
GEON1( "geon1", "http://download.geonames.org/export/dump/cities5000.zip" ),
@@ -121,6 +122,7 @@ public class LibraryProvider {
initAutotagging();
activateDeReWo();
initDidYouMean();
initSynonyms();
integrateOpenGeoDB();
integrateGeonames0(-1);
integrateGeonames1(-1);
@@ -169,8 +171,7 @@
geoLoc.activateLocation(Dictionary.GEON2.nickname, new GeonamesLocation(geon, dymLib, minPopulation));
return;
}
}
}
public static void initDidYouMean() {
final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries);
if ( !dymDict.exists() ) {
@@ -186,7 +187,13 @@
}
autotagging = new Autotagging(autotaggingPath);
}
public static void initSynonyms() {
final File synonymPath = new File(dictRoot, path_to_synonym_dictionaries);
if ( !synonymPath.exists() ) {
synonymPath.mkdirs();
}
synonyms = new SynonymLibrary(synonymPath);
}
public static void activateDeReWo() {
// translate input files (once..)
final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries);

@@ -100,6 +100,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
title(TagType.pair),
b(TagType.pair),
strong(TagType.pair),
u(TagType.pair),
i(TagType.pair),
li(TagType.pair),
script(TagType.pair),
@@ -130,7 +131,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
private Collection<String> titles;
//private String headline;
private List<String>[] headlines;
private final ClusteredScoreMap<String> bold, italic;
private final ClusteredScoreMap<String> bold, italic, underline;
private final List<String> li;
private final CharBuffer content;
private final EventListenerList htmlFilterEventListeners;
@@ -177,6 +178,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
for (int i = 0; i < this.headlines.length; i++) this.headlines[i] = new ArrayList<String>();
this.bold = new ClusteredScoreMap<String>();
this.italic = new ClusteredScoreMap<String>();
this.underline = new ClusteredScoreMap<String>();
this.li = new ArrayList<String>();
this.content = new CharBuffer(MAX_DOCSIZE, 1024);
this.htmlFilterEventListeners = new EventListenerList();
@@ -494,6 +496,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if ((tagname.equalsIgnoreCase("i")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.italic.inc(h);
} else if ((tagname.equalsIgnoreCase("u")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.underline.inc(h);
} else if ((tagname.equalsIgnoreCase("li")) && (text.length < 1024)) {
h = recursiveParse(text);
if (h.length() > 0) this.li.add(h);
@@ -609,6 +614,19 @@ public class ContentScraper extends AbstractScraper implements Scraper {
return counter;
}
public String[] getUnderline() {
final List<String> a = new ArrayList<String>();
final Iterator<String> i = this.underline.keys(false);
while (i.hasNext()) a.add(i.next());
return a.toArray(new String[a.size()]);
}
public String[] getUnderlineCount(final String[] a) {
final String[] counter = new String[a.length];
for (int i = 0; i < a.length; i++) counter[i] = Integer.toString(this.underline.get(a[i]));
return counter;
}
public String[] getLi() {
return this.li.toArray(new String[this.li.size()]);
}
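The new getters follow the same parallel-array contract as the existing bold/italic pairs; a hedged sketch, where scraper stands for a hypothetical populated ContentScraper:
String[] texts = scraper.getUnderline(); // distinct <u> texts, most frequent first
String[] counts = scraper.getUnderlineCount(texts); // counts[i] is the occurrence count of texts[i], as a string
for (int i = 0; i < texts.length; i++) System.out.println(texts[i] + " x " + counts[i]);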

@@ -118,7 +118,7 @@ public class torrentParser extends AbstractParser implements Parser {
byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser();
Document[] d = parser.parse(new DigestURI("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false);
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false);
Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
} catch (IOException e) {

@@ -2489,9 +2489,10 @@ public final class Switchboard extends serverSwitch
final Condenser[] condenser = new Condenser[in.documents.length];
for ( int i = 0; i < in.documents.length; i++ ) {
condenser[i] =
new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry
.profile()
.indexMedia(), LibraryProvider.dymLib, true);
new Condenser(
in.documents[i], in.queueEntry.profile().indexText(),
in.queueEntry.profile().indexMedia(),
LibraryProvider.dymLib, LibraryProvider.synonyms, true);
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
@@ -2739,7 +2740,7 @@ public final class Switchboard extends serverSwitch
if ( document.indexingDenied() ) {
throw new Parser.Failure("indexing is denied", url);
}
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true);
ResultImages.registerImages(url, document, true);
Switchboard.this.webStructure.generateCitationReference(url, document);
storeDocumentIndex(

@@ -164,7 +164,7 @@ public class DocumentIndex extends Segment {
int c = 0;
for ( final Document document : documents ) {
if (document == null) continue;
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true);
rows[c++] =
super.storeDocument(
url,

@@ -357,6 +357,14 @@ public final class Fulltext implements Iterable<byte[]> {
return false;
}
public String failReason(final String urlHash) throws IOException {
if (urlHash == null) return null;
SolrDocument doc = this.solr.get(urlHash);
if (doc == null) return null;
String reason = (String) doc.getFieldValue(YaCySchema.failreason_t.name());
return reason == null ? null : reason.length() == 0 ? null : reason;
}
@Override
public Iterator<byte[]> iterator() {
CloneableIterator<byte[]> a = null;

@@ -395,7 +395,7 @@ public class Segment {
// STORE TO SOLR
try {
this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, metadata));
this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, metadata));
} catch ( final IOException e ) {
Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
}
@@ -517,7 +517,7 @@ public class Segment {
}
// get the word set
Set<String> words = null;
words = new Condenser(document, true, true, null, false).words().keySet();
words = new Condenser(document, true, true, null, null, false).words().keySet();
// delete all word references
int count = 0;

@@ -338,7 +338,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, final URIMetadata metadata) {
public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, Condenser condenser, final URIMetadata metadata) {
// we use the SolrCell design as index scheme
final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
@@ -416,6 +416,10 @@
final int contentwc = content.split(" ").length;
add(doc, YaCySchema.wordcount_i, contentwc);
}
if (allAttr || contains(YaCySchema.synonyms_sxt)) {
List<String> synonyms = condenser.synonyms();
add(doc, YaCySchema.synonyms_sxt, synonyms);
}
// path elements of link
if (allAttr || contains(YaCySchema.url_paths_sxt)) add(doc, YaCySchema.url_paths_sxt, digestURI.getPaths());
@@ -506,6 +510,14 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
add(doc, YaCySchema.italic_val, html.getItalicCount(italic));
}
}
final String[] underline = html.getUnderline();
add(doc, YaCySchema.underlinecount_i, underline.length);
if (underline.length > 0) {
add(doc, YaCySchema.underline_txt, underline);
if (allAttr || contains(YaCySchema.underline_val)) {
add(doc, YaCySchema.underline_val, html.getUnderlineCount(underline));
}
}
final String[] li = html.getLi();
add(doc, YaCySchema.licount_i, li.length);
if (li.length > 0) add(doc, YaCySchema.li_txt, li);
@@ -867,7 +879,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
}
return a;
}
/**
* register an entry as error document
* @param digestURI

@@ -65,6 +65,7 @@ import net.yacy.kelondro.util.Bitfield;
import net.yacy.kelondro.util.SetTools;
import net.yacy.peers.Seed;
import net.yacy.search.index.Segment;
import net.yacy.search.index.SolrConfiguration;
import net.yacy.search.ranking.RankingProfile;
public final class QueryParams {
@@ -469,7 +470,7 @@ public final class QueryParams {
final static YaCySchema[] fields = new YaCySchema[]{
YaCySchema.sku,YaCySchema.title,YaCySchema.h1_txt,YaCySchema.h2_txt,
YaCySchema.author,YaCySchema.description,YaCySchema.keywords,YaCySchema.text_t
YaCySchema.author,YaCySchema.description,YaCySchema.keywords,YaCySchema.text_t,YaCySchema.synonyms_sxt
};
final static Map<YaCySchema,Float> boosts = new LinkedHashMap<YaCySchema,Float>();
@@ -499,34 +500,8 @@
public String solrQueryString() {
if (this.solrQueryString != null) return this.solrQueryString;
if (this.query_include_words == null || this.query_include_words.size() == 0) return null;
final StringBuilder q = new StringBuilder(80);
// add text query
int wc = 0;
StringBuilder w = new StringBuilder(80);
for (String s: this.query_include_words) {
if (wc > 0) w.append(" AND ");
w.append(s);
wc++;
}
for (String s: this.query_exclude_words){
if (wc > 0) w.append(" AND -");
w.append(s);
wc++;
}
// combine these queries for all relevant fields
wc = 0;
for (YaCySchema field: fields) {
if (wc > 0) q.append(" OR ");
q.append('(').append(field.name()).append(':').append(w).append(')');
wc++;
}
q.insert(0, '(');
q.append(')');
// add a filter to prevent results coming from failed urls
q.append(" AND -").append(YaCySchema.failreason_t.name()).append(":[* TO *]");
// get text query
final StringBuilder q = solrQueryString(this.query_include_words, this.query_exclude_words, this.indexSegment.fulltext().getSolrScheme());
// add constraints
if ( this.sitehash == null ) {
@@ -547,7 +522,8 @@
if (this.radius > 0.0d && this.lat != 0.0d && this.lon != 0.0d) {
// location search, no special ranking
q.append('&').append(CommonParams.FQ).append("={!bbox sfield=").append(YaCySchema.coordinate_p.name()).append("}&pt=");
// try http://localhost:8090/solr/select?q=*:*&fq={!bbox}&sfield=coordinate_p&pt=50.17,8.65&d=1
q.append('&').append(CommonParams.FQ).append("=!bbox&sfield=").append(YaCySchema.coordinate_p.name()).append("&pt=");
q.append(Double.toString(this.lat)).append(',').append(Double.toString(this.lon)).append("&d=").append(GeoLocation.degreeToKm(this.radius));
} else {
// set ranking
@@ -571,6 +547,40 @@
return this.solrQueryString;
}
public static StringBuilder solrQueryString(Collection<String> include, Collection<String> exclude, SolrConfiguration configuration) {
final StringBuilder q = new StringBuilder(80);
// add text query
int wc = 0;
StringBuilder w = new StringBuilder(80);
for (String s: include) {
if (wc > 0) w.append(" AND ");
w.append(s);
wc++;
}
for (String s: exclude){
if (wc > 0) w.append(" AND -");
w.append(s);
wc++;
}
// combine these queries for all relevant fields
wc = 0;
for (YaCySchema field: fields) {
if (configuration != null && !configuration.contains(field.name())) continue;
if (wc > 0) q.append(" OR ");
q.append('(').append(field.name()).append(':').append(w).append(')');
wc++;
}
q.insert(0, '(');
q.append(')');
// add a filter to prevent results coming from failed urls
q.append(" AND -").append(YaCySchema.failreason_t.name()).append(":[* TO *]");
return q;
}
public String queryStringForUrl() {
try {
return URLEncoder.encode(this.queryString, "UTF-8");
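To illustrate the extracted static solrQueryString (hypothetical input; the field disjunction is abbreviated): with include = [yacy, search] and exclude = [proxy], and all fields enabled, the builder yields roughly
((sku:yacy AND search AND -proxy) OR (title:yacy AND search AND -proxy) OR ... OR (synonyms_sxt:yacy AND search AND -proxy)) AND -failreason_t:[* TO *]
Fields missing from the active SolrConfiguration are now skipped via the contains() check.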

@@ -150,7 +150,7 @@ public final class SearchEvent {
this.rankingProcess = new RWIProcess(this.query, this.order, remote);
// start a local solr search
RemoteSearch.solrRemoteSearch(this, 100, 10000, null /*this peer*/, Switchboard.urlBlacklist);
RemoteSearch.solrRemoteSearch(this, Math.min(300, this.query.itemsPerPage() * 3), 10000, null /*this peer*/, Switchboard.urlBlacklist);
// start a local RWI search concurrently
this.rankingProcess.start();
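Worked values for the new remote-search sizing: at the default 10 items per page the request asks for min(300, 10 * 3) = 30 results instead of the previous fixed 100; the 300 cap only binds at 100 or more items per page.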

@@ -472,7 +472,7 @@ public class serverObjects extends HashMap<String, String> implements Cloneable
public SolrParams toSolrParams(YaCySchema[] facets) {
// check if all required post fields are there
if (!this.containsKey(CommonParams.DF)) this.put(CommonParams.DF, YaCySchema.text_t.name()); // set default field to all fields
if (!this.containsKey(CommonParams.DF)) this.put(CommonParams.DF, YaCySchema.text_t.name()); // set default field to the text field
if (!this.containsKey(CommonParams.START)) this.put(CommonParams.START, "0"); // set default start item
if (!this.containsKey(CommonParams.ROWS)) this.put(CommonParams.ROWS, "10"); // set default number of search results
