Merge branch 'master' of ssh://git@gitorious.org/yacy/rc1.git

pull/1/head
Michael Peter Christen 12 years ago
commit 3b959ee002

@ -119,6 +119,9 @@ responsetime_i
## all visible text, text
text_t
## additional synonyms to the words in the text, text
synonyms_t
## h1 header
h1_txt

@ -67,6 +67,7 @@ public enum YaCySchema implements Schema {
imagescount_i(SolrType.integer, true, true, false, "number of images"),
responsetime_i(SolrType.integer, true, true, false, "response time of target server in milliseconds"),
text_t(SolrType.text_general, true, true, false, "all visible text"),
synonyms_t(SolrType.text_general, true, true, false, "additional synonyms to the words in the text"),
h1_txt(SolrType.text_general, true, true, true, "h1 header"),
h2_txt(SolrType.text_general, true, true, true, "h2 header"),
h3_txt(SolrType.text_general, true, true, true, "h3 header"),

@ -85,7 +85,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
}
//get words from document
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, false).words();
final Map<String, Word> words = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false).words();
// generate potential tags from document title, description and subject
final int bufferSize = document.dc_title().length() + document.dc_description().length() + document.dc_subject(' ').length() + 32;

@ -38,6 +38,7 @@ import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.WordCache;
import net.yacy.cora.document.Classification.ContentDomain;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.document.language.Identificator;
import net.yacy.document.parser.html.ImageEntry;
@ -66,6 +67,7 @@ public final class Condenser {
//private Properties analysis;
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
private final Set<String> synonyms; // a set of synonyms to the words
public int RESULT_NUMB_WORDS = -1;
public int RESULT_DIFF_WORDS = -1;
@ -79,12 +81,14 @@ public final class Condenser {
final boolean indexText,
final boolean indexMedia,
final WordCache meaningLib,
final SynonymLibrary stemming,
final boolean doAutotagging
) {
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
// if addMedia == true, then all the media links are also parsed and added to the words
// added media words are flagged with the appropriate media flag
this.words = new HashMap<String, Word>();
this.synonyms = new HashSet<String>();
this.RESULT_FLAGS = new Bitfield(4);
// construct flag set for document
@ -202,6 +206,14 @@ public final class Condenser {
if (!this.tags.isEmpty()) {
document.addMetatags(this.tags);
}
// create the synonyms set
if (stemming != null) {
for (String word: this.words.keySet()) {
Set<String> syms = stemming.getSynonyms(word);
if (syms != null) this.synonyms.addAll(syms);
}
}
}
private void insertTextToWords(
@ -239,6 +251,7 @@ public final class Condenser {
this.languageIdentificator = null; // we don't need that here
// analysis = new Properties();
this.words = new TreeMap<String, Word>();
this.synonyms = new HashSet<String>();
createCondensement(text, meaningLib, doAutotagging);
}
@ -255,6 +268,10 @@ public final class Condenser {
return this.words;
}
/**
 * Returns the set of synonyms held by this condenser.
 * The internal set is handed out directly (no copy is made), so modifications
 * made by the caller are visible to this condenser as well.
 *
 * @return the synonym set; it is created empty by the constructors and is filled
 *         with the synonyms of the condensed words when a synonym library was supplied
 */
public Set<String> synonyms() {
return this.synonyms;
}
/**
 * Returns the language determined for the condensed content.
 *
 * @return the language reported by the language identificator, or {@code null}
 *         when this condenser has no identificator (one of the constructors
 *         assigns {@code null} to it because it does not need language detection)
 */
public String language() {
    // guard against instances whose constructor set languageIdentificator to null;
    // dereferencing it unconditionally would throw a NullPointerException
    if (this.languageIdentificator == null) {
        return null;
    }
    return this.languageIdentificator.getLanguage();
}

@ -47,6 +47,7 @@ import net.yacy.cora.document.WordCache;
import net.yacy.cora.geo.GeonamesLocation;
import net.yacy.cora.geo.OpenGeoDBLocation;
import net.yacy.cora.geo.OverarchingLocation;
import net.yacy.cora.language.synonyms.SynonymLibrary;
import net.yacy.cora.lod.JenaTripleStore;
import net.yacy.cora.lod.vocabulary.Tagging;
import net.yacy.cora.lod.vocabulary.Tagging.SOTuple;
@ -61,19 +62,19 @@ public class LibraryProvider {
public static final String path_to_source_dictionaries = "source";
public static final String path_to_did_you_mean_dictionaries = "didyoumean";
public static final String path_to_autotagging_dictionaries = "autotagging";
public static final String path_to_synonym_dictionaries = "synonyms";
public static final String disabledExtension = ".disabled";
public static WordCache dymLib = new WordCache(null);
public static Autotagging autotagging = null;
public static SynonymLibrary synonyms = null;
public static OverarchingLocation geoLoc = new OverarchingLocation();
private static File dictSource = null;
private static File dictRoot = null;
public static enum Dictionary {
GEODB0(
"geo0",
"http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz" ),
GEODB0( "geo0", "http://downloads.sourceforge.net/project/opengeodb/Data/0.2.5a/opengeodb-0.2.5a-UTF8-sql.gz" ),
GEODB1( "geo1", "http://fa-technik.adfc.de/code/opengeodb/dump/opengeodb-02624_2011-10-17.sql.gz" ),
GEON0( "geon0", "http://download.geonames.org/export/dump/cities1000.zip" ),
GEON1( "geon1", "http://download.geonames.org/export/dump/cities5000.zip" ),
@ -121,6 +122,7 @@ public class LibraryProvider {
initAutotagging();
activateDeReWo();
initDidYouMean();
initSynonyms();
integrateOpenGeoDB();
integrateGeonames0(-1);
integrateGeonames1(-1);
@ -170,7 +172,6 @@ public class LibraryProvider {
return;
}
}
public static void initDidYouMean() {
final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries);
if ( !dymDict.exists() ) {
@ -186,7 +187,13 @@ public class LibraryProvider {
}
autotagging = new Autotagging(autotaggingPath);
}
/**
 * Initializes the synonym library from the synonym dictionary directory
 * below the dictionary root. The directory is created first if it does not
 * exist yet; afterwards the static {@code synonyms} field is replaced by a
 * new library instance bound to that directory.
 */
public static void initSynonyms() {
    final File synonymDir = new File(dictRoot, path_to_synonym_dictionaries);
    final boolean dirMissing = !synonymDir.exists();
    if (dirMissing) {
        // make sure the dictionary folder is present before the library is bound to it
        synonymDir.mkdirs();
    }
    synonyms = new SynonymLibrary(synonymDir);
}
public static void activateDeReWo() {
// translate input files (once..)
final File dymDict = new File(dictRoot, path_to_did_you_mean_dictionaries);

@ -118,7 +118,7 @@ public class torrentParser extends AbstractParser implements Parser {
byte[] b = FileUtils.read(new File(args[0]));
torrentParser parser = new torrentParser();
Document[] d = parser.parse(new DigestURI("http://localhost/test.torrent"), null, "UTF-8", new ByteArrayInputStream(b));
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, false);
Condenser c = new Condenser(d[0], true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, false);
Map<String, Word> w = c.words();
for (Map.Entry<String, Word> e: w.entrySet()) System.out.println("Word: " + e.getKey() + " - " + e.getValue().posInText);
} catch (IOException e) {

@ -2464,9 +2464,10 @@ public final class Switchboard extends serverSwitch
final Condenser[] condenser = new Condenser[in.documents.length];
for ( int i = 0; i < in.documents.length; i++ ) {
condenser[i] =
new Condenser(in.documents[i], in.queueEntry.profile().indexText(), in.queueEntry
.profile()
.indexMedia(), LibraryProvider.dymLib, true);
new Condenser(
in.documents[i], in.queueEntry.profile().indexText(),
in.queueEntry.profile().indexMedia(),
LibraryProvider.dymLib, LibraryProvider.synonyms, true);
// update image result list statistics
// its good to do this concurrently here, because it needs a DNS lookup
@ -2714,7 +2715,7 @@ public final class Switchboard extends serverSwitch
if ( document.indexingDenied() ) {
throw new Parser.Failure("indexing is denied", url);
}
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true);
ResultImages.registerImages(url, document, true);
Switchboard.this.webStructure.generateCitationReference(url, document);
storeDocumentIndex(

@ -164,7 +164,7 @@ public class DocumentIndex extends Segment {
int c = 0;
for ( final Document document : documents ) {
if (document == null) continue;
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, true);
final Condenser condenser = new Condenser(document, true, true, LibraryProvider.dymLib, LibraryProvider.synonyms, true);
rows[c++] =
super.storeDocument(
url,

@ -395,7 +395,7 @@ public class Segment {
// STORE TO SOLR
try {
this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, metadata));
this.fulltext.putDocument(this.fulltext.getSolrScheme().yacy2solr(id, profile, responseHeader, document, condenser, metadata));
} catch ( final IOException e ) {
Log.logWarning("SOLR", "failed to send " + urlNormalform + " to solr: " + e.getMessage());
}
@ -517,7 +517,7 @@ public class Segment {
}
// get the word set
Set<String> words = null;
words = new Condenser(document, true, true, null, false).words().keySet();
words = new Condenser(document, true, true, null, null, false).words().keySet();
// delete all word references
int count = 0;

@ -338,7 +338,7 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, final URIMetadata metadata) {
public SolrInputDocument yacy2solr(final String id, final CrawlProfile profile, final ResponseHeader header, final Document yacydoc, Condenser condenser, final URIMetadata metadata) {
// we use the SolrCell design as index scheme
final SolrInputDocument doc = new SolrInputDocument();
final DigestURI digestURI = new DigestURI(yacydoc.dc_source());
@ -416,6 +416,13 @@ public class SolrConfiguration extends ConfigurationSet implements Serializable
final int contentwc = content.split(" ").length;
add(doc, YaCySchema.wordcount_i, contentwc);
}
if (allAttr || contains(YaCySchema.synonyms_t)) {
Set<String> synonyms = condenser.synonyms();
StringBuilder s = new StringBuilder(synonyms.size() * 8);
for (String o: synonyms) s.append(o).append(' ');
if (s.length() > 0) s.setLength(s.length() - 1);
add(doc, YaCySchema.synonyms_t, s.toString());
}
// path elements of link
if (allAttr || contains(YaCySchema.url_paths_sxt)) add(doc, YaCySchema.url_paths_sxt, digestURI.getPaths());

Loading…
Cancel
Save