- migrates all entries in the old urldb. The metadata coordinate (lat / lon) NumberFormatException still occurs relatively often (see excerpt below), - added try/catch for URIMetadataRow (seems not to be needed in URIMetadataNode, as Solr internally checks the number format) - removed a possible type conversion for the lat() / lon() comparison with 0.0f, changed to 0.0 (leaving it to the compiler/optimizer to choose the number format) current log excerpt for NumberFormatException: W 2013/01/14 00:10:07 StackTrace For input string: "-" java.lang.NumberFormatException: For input string: "-" at sun.misc.FloatingDecimal.readJavaFormatString(Unknown Source) at java.lang.Double.parseDouble(Unknown Source) at net.yacy.kelondro.data.meta.URIMetadataRow$Components.lon(URIMetadataRow.java:525) at net.yacy.kelondro.data.meta.URIMetadataRow.lon(URIMetadataRow.java:279) at net.yacy.search.index.SolrConfiguration.metadata2solr(SolrConfiguration.java:277) at net.yacy.search.index.Fulltext.putMetadata(Fulltext.java:329) at transferURL.respond(transferURL.java:152) ... Caused by: java.lang.NumberFormatException: For input string: "-" at sun.misc.FloatingDecimal.readJavaFormatString(Unknown Source) at java.lang.Double.parseDouble(Unknown Source) at net.yacy.kelondro.data.meta.URIMetadataRow$Components.lon(URIMetadataRow.java:525) at net.yacy.kelondro.data.meta.URIMetadataRow.lon(URIMetadataRow.java:279) at net.yacy.search.index.SolrConfiguration.metadata2solr(SolrConfiguration.java:277) at net.yacy.search.index.Fulltext.putMetadata(Fulltext.java:329) at transferURL.respond(transferURL.java:152)
parent
3b6e08b49f
commit
3897bb4409
@ -0,0 +1,36 @@
|
|||||||
|
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
|
||||||
|
<html xmlns="http://www.w3.org/1999/xhtml">
|
||||||
|
<head>
|
||||||
|
<title>Migrate URLdb</title>
|
||||||
|
#%env/templates/metas.template%#
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
#%env/templates/header.template%#
|
||||||
|
#%env/templates/simpleheader.template%#
|
||||||
|
|
||||||
|
<h2>Migrate URLdb to embedded Solr Index</h2>
|
||||||
|
|
||||||
|
<p>Convert old meta data (urldb) index to embedded Solr fulltext index.</p>
|
||||||
|
|
||||||
|
<dl>
|
||||||
|
<dd>
|
||||||
|
<p>A low priority background job has been started which reads the old index, adds it to Solr and deletes the entry from the old index.</p>
|
||||||
|
<p>The default "slow migration" updates any entry in the old urldb index upon access (e.g. during search events).<br />
|
||||||
|
If you feel that entries which have not been accessed are still relevant, this migration will migrate all entries from the old urldb index.</p>
|
||||||
|
<p>You may refresh this page to see how many entries in the old index are left for migration.</p>
|
||||||
|
<p>Hint: this background task runs until all entries are migrated or YaCy is shutdown. The migration is not automatically restarted.</p>
|
||||||
|
</dd>
|
||||||
|
</dl>
|
||||||
|
<form action="migrateurldb_p.html" method="post" enctype="multipart/form-data" accept-charset="UTF-8">
|
||||||
|
<fieldset>
|
||||||
|
<input type="hidden" name="lastcount" value="#[lastcount]#" />
|
||||||
|
<input type="hidden" name="lasttime" value="#[lasttime]#" />
|
||||||
|
|
||||||
|
<p><b>#[count]# entries</b> in old index left to migrate.</p>
|
||||||
|
<p>For large indexes this may run for a long time (migration speed: #[speed]# entries per minute) <input type="submit" name="dorefresh" value="refresh" /></p>
|
||||||
|
</fieldset>
|
||||||
|
</form>
|
||||||
|
|
||||||
|
#%env/templates/footer.template%#
|
||||||
|
</body>
|
||||||
|
</html>
|
@ -0,0 +1,44 @@
|
|||||||
|
// migrateurldb_p.java
|
||||||
|
|
||||||
|
import net.yacy.cora.protocol.RequestHeader;
|
||||||
|
import net.yacy.migration;
|
||||||
|
import net.yacy.search.Switchboard;
|
||||||
|
import net.yacy.server.serverObjects;
|
||||||
|
import net.yacy.server.serverSwitch;
|
||||||
|
|
||||||
|
public class migrateurldb_p {
|
||||||
|
|
||||||
|
public static serverObjects respond(@SuppressWarnings("unused") final RequestHeader header, final serverObjects post, @SuppressWarnings("unused") final serverSwitch env) {
|
||||||
|
final serverObjects prop = new serverObjects();
|
||||||
|
final Switchboard sb = (Switchboard) env;
|
||||||
|
|
||||||
|
int cnt;
|
||||||
|
|
||||||
|
if ((cnt = migration.migrateUrldbtoSolr(sb)) > 0) {
|
||||||
|
prop.put("count", cnt);
|
||||||
|
|
||||||
|
if (post != null && post.containsKey("dorefresh")) {
|
||||||
|
int lastcount = post.getInt("lastcount", 0);
|
||||||
|
Long t = post.getLong("lasttime", 1);
|
||||||
|
|
||||||
|
Double difft = (System.currentTimeMillis() - t) / 60000.0d;
|
||||||
|
int diff = (int)((lastcount - cnt) / difft) ;
|
||||||
|
prop.put("speed", diff);
|
||||||
|
prop.put("lasttime", t);
|
||||||
|
prop.put("lastcount", lastcount);
|
||||||
|
|
||||||
|
} else {
|
||||||
|
prop.put("speed", "?");
|
||||||
|
prop.put("lastcount",cnt);
|
||||||
|
prop.put("lasttime", System.currentTimeMillis());
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
prop.put("speed", "");
|
||||||
|
prop.put("count", "no urldb index available");
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
// return rewrite properties
|
||||||
|
return prop;
|
||||||
|
}
|
||||||
|
}
|
@ -1,495 +1,495 @@
|
|||||||
/**
|
/**
|
||||||
* Condenser.java
|
* Condenser.java
|
||||||
* Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
|
* Copyright 2004 by Michael Peter Christen, mc@yacy.net, Frankfurt am Main, Germany
|
||||||
* First released 09.01.2004 at http://yacy.net
|
* First released 09.01.2004 at http://yacy.net
|
||||||
*
|
*
|
||||||
* This library is free software; you can redistribute it and/or
|
* This library is free software; you can redistribute it and/or
|
||||||
* modify it under the terms of the GNU Lesser General Public
|
* modify it under the terms of the GNU Lesser General Public
|
||||||
* License as published by the Free Software Foundation; either
|
* License as published by the Free Software Foundation; either
|
||||||
* version 2.1 of the License, or (at your option) any later version.
|
* version 2.1 of the License, or (at your option) any later version.
|
||||||
*
|
*
|
||||||
* This library is distributed in the hope that it will be useful,
|
* This library is distributed in the hope that it will be useful,
|
||||||
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||||
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||||
* Lesser General Public License for more details.
|
* Lesser General Public License for more details.
|
||||||
*
|
*
|
||||||
* You should have received a copy of the GNU Lesser General Public License
|
* You should have received a copy of the GNU Lesser General Public License
|
||||||
* along with this program in the file lgpl21.txt
|
* along with this program in the file lgpl21.txt
|
||||||
* If not, see <http://www.gnu.org/licenses/>.
|
* If not, see <http://www.gnu.org/licenses/>.
|
||||||
*/
|
*/
|
||||||
|
|
||||||
package net.yacy.document;
|
package net.yacy.document;
|
||||||
|
|
||||||
import java.io.File;
|
import java.io.File;
|
||||||
import java.io.FileInputStream;
|
import java.io.FileInputStream;
|
||||||
import java.io.FileNotFoundException;
|
import java.io.FileNotFoundException;
|
||||||
import java.io.IOException;
|
import java.io.IOException;
|
||||||
import java.util.ArrayList;
|
import java.util.ArrayList;
|
||||||
import java.util.HashMap;
|
import java.util.HashMap;
|
||||||
import java.util.HashSet;
|
import java.util.HashSet;
|
||||||
import java.util.Iterator;
|
import java.util.Iterator;
|
||||||
import java.util.LinkedHashSet;
|
import java.util.LinkedHashSet;
|
||||||
import java.util.List;
|
import java.util.List;
|
||||||
import java.util.Locale;
|
import java.util.Locale;
|
||||||
import java.util.Map;
|
import java.util.Map;
|
||||||
import java.util.Properties;
|
import java.util.Properties;
|
||||||
import java.util.Set;
|
import java.util.Set;
|
||||||
import java.util.SortedSet;
|
import java.util.SortedSet;
|
||||||
import java.util.TreeMap;
|
import java.util.TreeMap;
|
||||||
|
|
||||||
import org.apache.solr.common.params.MapSolrParams;
|
import org.apache.solr.common.params.MapSolrParams;
|
||||||
import org.apache.solr.update.processor.Lookup3Signature;
|
import org.apache.solr.update.processor.Lookup3Signature;
|
||||||
|
|
||||||
import net.yacy.cora.document.ASCII;
|
import net.yacy.cora.document.ASCII;
|
||||||
import net.yacy.cora.document.WordCache;
|
import net.yacy.cora.document.WordCache;
|
||||||
import net.yacy.cora.document.analysis.Classification.ContentDomain;
|
import net.yacy.cora.document.analysis.Classification.ContentDomain;
|
||||||
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
|
import net.yacy.cora.document.analysis.EnhancedTextProfileSignature;
|
||||||
import net.yacy.cora.document.MultiProtocolURI;
|
import net.yacy.cora.document.MultiProtocolURI;
|
||||||
import net.yacy.cora.federate.solr.Boost;
|
import net.yacy.cora.federate.solr.Boost;
|
||||||
import net.yacy.cora.language.synonyms.SynonymLibrary;
|
import net.yacy.cora.language.synonyms.SynonymLibrary;
|
||||||
import net.yacy.cora.lod.vocabulary.Tagging;
|
import net.yacy.cora.lod.vocabulary.Tagging;
|
||||||
import net.yacy.document.language.Identificator;
|
import net.yacy.document.language.Identificator;
|
||||||
import net.yacy.document.parser.html.ImageEntry;
|
import net.yacy.document.parser.html.ImageEntry;
|
||||||
import net.yacy.kelondro.data.word.Word;
|
import net.yacy.kelondro.data.word.Word;
|
||||||
import net.yacy.kelondro.data.word.WordReferenceRow;
|
import net.yacy.kelondro.data.word.WordReferenceRow;
|
||||||
import net.yacy.kelondro.logging.Log;
|
import net.yacy.kelondro.logging.Log;
|
||||||
import net.yacy.kelondro.util.Bitfield;
|
import net.yacy.kelondro.util.Bitfield;
|
||||||
import net.yacy.kelondro.util.SetTools;
|
import net.yacy.kelondro.util.SetTools;
|
||||||
|
|
||||||
|
|
||||||
public final class Condenser {
|
public final class Condenser {
|
||||||
|
|
||||||
// this is the page analysis class
|
// this is the page analysis class
|
||||||
public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
|
public final static boolean pseudostemming = false; // switch for removal of words that appear in shortened form
|
||||||
public final static int wordminsize = 2;
|
public final static int wordminsize = 2;
|
||||||
public final static int wordcut = 2;
|
public final static int wordcut = 2;
|
||||||
|
|
||||||
// category flags that show how the page can be distinguished in different interest groups
|
// category flags that show how the page can be distinguished in different interest groups
|
||||||
public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
|
public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
|
||||||
public static final int flag_cat_haslocation = 19; // the page has a location metadata attached
|
public static final int flag_cat_haslocation = 19; // the page has a location metadata attached
|
||||||
public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
|
public static final int flag_cat_hasimage = 20; // the page refers to (at least one) images
|
||||||
public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
|
public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
|
||||||
public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
|
public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) videos
|
||||||
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
|
public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file
|
||||||
|
|
||||||
//private Properties analysis;
|
//private Properties analysis;
|
||||||
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
|
private final Map<String, Word> words; // a string (the words) to (indexWord) - relation
|
||||||
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
|
private final Map<String, Set<Tagging.Metatag>> tags = new HashMap<String, Set<Tagging.Metatag>>(); // a set of tags, discovered from Autotagging
|
||||||
private final Set<String> synonyms; // a set of synonyms to the words
|
private final Set<String> synonyms; // a set of synonyms to the words
|
||||||
private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection
|
private long fuzzy_signature = 0, exact_signature = 0; // signatures for double-check detection
|
||||||
private String fuzzy_signature_text = null; // signatures for double-check detection
|
private String fuzzy_signature_text = null; // signatures for double-check detection
|
||||||
|
|
||||||
public int RESULT_NUMB_WORDS = -1;
|
public int RESULT_NUMB_WORDS = -1;
|
||||||
public int RESULT_DIFF_WORDS = -1;
|
public int RESULT_DIFF_WORDS = -1;
|
||||||
public int RESULT_NUMB_SENTENCES = -1;
|
public int RESULT_NUMB_SENTENCES = -1;
|
||||||
public int RESULT_DIFF_SENTENCES = -1;
|
public int RESULT_DIFF_SENTENCES = -1;
|
||||||
public Bitfield RESULT_FLAGS = new Bitfield(4);
|
public Bitfield RESULT_FLAGS = new Bitfield(4);
|
||||||
private final Identificator languageIdentificator;
|
private final Identificator languageIdentificator;
|
||||||
|
|
||||||
public Condenser(
|
public Condenser(
|
||||||
final Document document,
|
final Document document,
|
||||||
final boolean indexText,
|
final boolean indexText,
|
||||||
final boolean indexMedia,
|
final boolean indexMedia,
|
||||||
final WordCache meaningLib,
|
final WordCache meaningLib,
|
||||||
final SynonymLibrary synlib,
|
final SynonymLibrary synlib,
|
||||||
final boolean doAutotagging
|
final boolean doAutotagging
|
||||||
) {
|
) {
|
||||||
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
|
Thread.currentThread().setName("condenser-" + document.dc_identifier()); // for debugging
|
||||||
// if addMedia == true, then all the media links are also parsed and added to the words
|
// if addMedia == true, then all the media links are also parsed and added to the words
|
||||||
// added media words are flagged with the appropriate media flag
|
// added media words are flagged with the appropriate media flag
|
||||||
this.words = new HashMap<String, Word>();
|
this.words = new HashMap<String, Word>();
|
||||||
this.synonyms = new LinkedHashSet<String>();
|
this.synonyms = new LinkedHashSet<String>();
|
||||||
this.RESULT_FLAGS = new Bitfield(4);
|
this.RESULT_FLAGS = new Bitfield(4);
|
||||||
|
|
||||||
// construct flag set for document
|
// construct flag set for document
|
||||||
if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
|
if (document.dc_source().getContentDomain() == ContentDomain.IMAGE || !document.getImages().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasimage, true);
|
||||||
if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
|
if (document.dc_source().getContentDomain() == ContentDomain.AUDIO || !document.getAudiolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasaudio, true);
|
||||||
if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
|
if (document.dc_source().getContentDomain() == ContentDomain.VIDEO || !document.getVideolinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasvideo, true);
|
||||||
if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
|
if (document.dc_source().getContentDomain() == ContentDomain.APP || !document.getApplinks().isEmpty()) this.RESULT_FLAGS.set(flag_cat_hasapp, true);
|
||||||
if (document.lat() != 0.0f && document.lon() != 0.0f) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
|
if (document.lat() != 0.0 && document.lon() != 0.0) this.RESULT_FLAGS.set(flag_cat_haslocation, true);
|
||||||
|
|
||||||
this.languageIdentificator = new Identificator();
|
this.languageIdentificator = new Identificator();
|
||||||
|
|
||||||
// add the URL components to the word list
|
// add the URL components to the word list
|
||||||
insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
|
insertTextToWords(new SentenceReader(document.dc_source().toTokens()), 0, WordReferenceRow.flag_app_dc_identifier, this.RESULT_FLAGS, false, meaningLib);
|
||||||
|
|
||||||
Map.Entry<MultiProtocolURI, String> entry;
|
Map.Entry<MultiProtocolURI, String> entry;
|
||||||
if (indexText) {
|
if (indexText) {
|
||||||
createCondensement(document.getTextString(), meaningLib, doAutotagging);
|
createCondensement(document.getTextString(), meaningLib, doAutotagging);
|
||||||
// the phrase counter:
|
// the phrase counter:
|
||||||
// phrase 0 are words taken from the URL
|
// phrase 0 are words taken from the URL
|
||||||
// phrase 1 is the MainTitle
|
// phrase 1 is the MainTitle
|
||||||
// phrase 2 is <not used>
|
// phrase 2 is <not used>
|
||||||
// phrase 3 is the Document Abstract
|
// phrase 3 is the Document Abstract
|
||||||
// phrase 4 is the Document Author
|
// phrase 4 is the Document Author
|
||||||
// phrase 5 is the Document Publisher
|
// phrase 5 is the Document Publisher
|
||||||
// phrase 6 are the tags specified in document
|
// phrase 6 are the tags specified in document
|
||||||
// phrase 10 and above are the section headlines/titles (88 possible)
|
// phrase 10 and above are the section headlines/titles (88 possible)
|
||||||
// phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
|
// phrase 98 is taken from the embedded anchor/hyperlinks description (REMOVED!)
|
||||||
// phrase 99 is taken from the media Link url and anchor description
|
// phrase 99 is taken from the media Link url and anchor description
|
||||||
// phrase 100 and above are lines from the text
|
// phrase 100 and above are lines from the text
|
||||||
insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
|
insertTextToWords(new SentenceReader(document.dc_title()), 1, WordReferenceRow.flag_app_dc_title, this.RESULT_FLAGS, true, meaningLib);
|
||||||
insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
|
insertTextToWords(new SentenceReader(document.dc_description()), 3, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
|
||||||
insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
|
insertTextToWords(new SentenceReader(document.dc_creator()), 4, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
|
||||||
insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
|
insertTextToWords(new SentenceReader(document.dc_publisher()), 5, WordReferenceRow.flag_app_dc_creator, this.RESULT_FLAGS, true, meaningLib);
|
||||||
insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
|
insertTextToWords(new SentenceReader(document.dc_subject(' ')), 6, WordReferenceRow.flag_app_dc_description, this.RESULT_FLAGS, true, meaningLib);
|
||||||
// missing: tags!
|
// missing: tags!
|
||||||
final String[] titles = document.getSectionTitles();
|
final String[] titles = document.getSectionTitles();
|
||||||
for (int i = 0; i < titles.length; i++) {
|
for (int i = 0; i < titles.length; i++) {
|
||||||
insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
|
insertTextToWords(new SentenceReader(titles[i]), i + 10, WordReferenceRow.flag_app_emphasized, this.RESULT_FLAGS, true, meaningLib);
|
||||||
}
|
}
|
||||||
|
|
||||||
// anchors: for text indexing we add only the anchor description
|
// anchors: for text indexing we add only the anchor description
|
||||||
// REMOVED! Reason:
|
// REMOVED! Reason:
|
||||||
// words from the anchor description should appear as normal text in the output from the parser
|
// words from the anchor description should appear as normal text in the output from the parser
|
||||||
// to flag these words as appearance in dc_description would confuse, since the user expects such word as titles of
|
// to flag these words as appearance in dc_description would confuse, since the user expects such word as titles of
|
||||||
// pages that are shown in the search result. The words from the URLS should also not appear as part of the index, because they
|
// pages that are shown in the search result. The words from the URLS should also not appear as part of the index, because they
|
||||||
// are not visible in the text and could be used to crate fake-content
|
// are not visible in the text and could be used to crate fake-content
|
||||||
/*
|
/*
|
||||||
final Iterator<Map.Entry<yacyURL, String>> i = document.getAnchors().entrySet().iterator();
|
final Iterator<Map.Entry<yacyURL, String>> i = document.getAnchors().entrySet().iterator();
|
||||||
while (i.hasNext()) {
|
while (i.hasNext()) {
|
||||||
entry = i.next();
|
entry = i.next();
|
||||||
if ((entry == null) || (entry.getKey() == null)) continue;
|
if ((entry == null) || (entry.getKey() == null)) continue;
|
||||||
insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true);
|
insertTextToWords(entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS, true);
|
||||||
}
|
}
|
||||||
*/
|
*/
|
||||||
} else {
|
} else {
|
||||||
this.RESULT_NUMB_WORDS = 0;
|
this.RESULT_NUMB_WORDS = 0;
|
||||||
this.RESULT_DIFF_WORDS = 0;
|
this.RESULT_DIFF_WORDS = 0;
|
||||||
this.RESULT_NUMB_SENTENCES = 0;
|
this.RESULT_NUMB_SENTENCES = 0;
|
||||||
this.RESULT_DIFF_SENTENCES = 0;
|
this.RESULT_DIFF_SENTENCES = 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (indexMedia) {
|
if (indexMedia) {
|
||||||
// add anchor descriptions: here, we also add the url components
|
// add anchor descriptions: here, we also add the url components
|
||||||
// audio
|
// audio
|
||||||
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
|
Iterator<Map.Entry<MultiProtocolURI, String>> i = document.getAudiolinks().entrySet().iterator();
|
||||||
while (i.hasNext()) {
|
while (i.hasNext()) {
|
||||||
entry = i.next();
|
entry = i.next();
|
||||||
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
|
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasaudio, this.RESULT_FLAGS, false, meaningLib);
|
||||||
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
|
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasaudio, this.RESULT_FLAGS, true, meaningLib);
|
||||||
}
|
}
|
||||||
|
|
||||||
// video
|
// video
|
||||||
i = document.getVideolinks().entrySet().iterator();
|
i = document.getVideolinks().entrySet().iterator();
|
||||||
while (i.hasNext()) {
|
while (i.hasNext()) {
|
||||||
entry = i.next();
|
entry = i.next();
|
||||||
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
|
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasvideo, this.RESULT_FLAGS, false, meaningLib);
|
||||||
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
|
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasvideo, this.RESULT_FLAGS, true, meaningLib);
|
||||||
}
|
}
|
||||||
|
|
||||||
// applications
|
// applications
|
||||||
i = document.getApplinks().entrySet().iterator();
|
i = document.getApplinks().entrySet().iterator();
|
||||||
while (i.hasNext()) {
|
while (i.hasNext()) {
|
||||||
entry = i.next();
|
entry = i.next();
|
||||||
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
|
insertTextToWords(new SentenceReader(entry.getKey().toNormalform(true)), 99, flag_cat_hasapp, this.RESULT_FLAGS, false, meaningLib);
|
||||||
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
|
insertTextToWords(new SentenceReader(entry.getValue()), 99, flag_cat_hasapp, this.RESULT_FLAGS, true, meaningLib);
|
||||||
}
|
}
|
||||||
|
|
||||||
// images
|
// images
|
||||||
final Iterator<ImageEntry> j = document.getImages().values().iterator();
|
final Iterator<ImageEntry> j = document.getImages().values().iterator();
|
||||||
ImageEntry ientry;
|
ImageEntry ientry;
|
||||||
MultiProtocolURI url;
|
MultiProtocolURI url;
|
||||||
while (j.hasNext()) {
|
while (j.hasNext()) {
|
||||||
ientry = j.next();
|
ientry = j.next();
|
||||||
url = ientry.url();
|
url = ientry.url();
|
||||||
if (url == null) continue;
|
if (url == null) continue;
|
||||||
insertTextToWords(new SentenceReader(url.toNormalform(true)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
|
insertTextToWords(new SentenceReader(url.toNormalform(true)), 99, flag_cat_hasimage, this.RESULT_FLAGS, false, meaningLib);
|
||||||
insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
|
insertTextToWords(new SentenceReader(ientry.alt()), 99, flag_cat_hasimage, this.RESULT_FLAGS, true, meaningLib);
|
||||||
}
|
}
|
||||||
|
|
||||||
// finally check all words for missing flag entry
|
// finally check all words for missing flag entry
|
||||||
final Iterator<Map.Entry<String, Word>> k = this.words.entrySet().iterator();
|
final Iterator<Map.Entry<String, Word>> k = this.words.entrySet().iterator();
|
||||||
Word wprop;
|
Word wprop;
|
||||||
Map.Entry<String, Word> we;
|
Map.Entry<String, Word> we;
|
||||||
while (k.hasNext()) {
|
while (k.hasNext()) {
|
||||||
we = k.next();
|
we = k.next();
|
||||||
wprop = we.getValue();
|
wprop = we.getValue();
|
||||||
if (wprop.flags == null) {
|
if (wprop.flags == null) {
|
||||||
wprop.flags = this.RESULT_FLAGS.clone();
|
wprop.flags = this.RESULT_FLAGS.clone();
|
||||||
this.words.put(we.getKey(), wprop);
|
this.words.put(we.getKey(), wprop);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// extend the tags in the document object with autotagging tags
|
// extend the tags in the document object with autotagging tags
|
||||||
if (!this.tags.isEmpty()) {
|
if (!this.tags.isEmpty()) {
|
||||||
document.addMetatags(this.tags);
|
document.addMetatags(this.tags);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (synlib != null) {
|
if (synlib != null) {
|
||||||
for (String word: this.words.keySet()) {
|
for (String word: this.words.keySet()) {
|
||||||
Set<String> syms = synlib.getSynonyms(word);
|
Set<String> syms = synlib.getSynonyms(word);
|
||||||
if (syms != null) this.synonyms.addAll(syms);
|
if (syms != null) this.synonyms.addAll(syms);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
String text = document.getTextString();
|
String text = document.getTextString();
|
||||||
|
|
||||||
// create the synonyms set
|
// create the synonyms set
|
||||||
if (synonyms != null) {
|
if (synonyms != null) {
|
||||||
for (String word: this.words.keySet()) {
|
for (String word: this.words.keySet()) {
|
||||||
Set<String> syms = synlib.getSynonyms(word);
|
Set<String> syms = synlib.getSynonyms(word);
|
||||||
if (syms != null) this.synonyms.addAll(syms);
|
if (syms != null) this.synonyms.addAll(syms);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// create hashes for duplicate detection
|
// create hashes for duplicate detection
|
||||||
// check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
|
// check dups with http://localhost:8090/solr/select?q=*:*&start=0&rows=3&fl=sku,fuzzy_signature_text_t,fuzzy_signature_l,fuzzy_signature_unique_b
|
||||||
EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature();
|
EnhancedTextProfileSignature fuzzySignatureFactory = new EnhancedTextProfileSignature();
|
||||||
Map<String,String> sp = new HashMap<String,String>();
|
Map<String,String> sp = new HashMap<String,String>();
|
||||||
sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
|
sp.put("quantRate", Float.toString(Boost.RANKING.getQuantRate())); // for minTokenLen = 2 the value should not be below 0.24; for minTokenLen = 3 the value must be not below 0.5!
|
||||||
sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen()));
|
sp.put("minTokenLen", Integer.toString(Boost.RANKING.getMinTokenLen()));
|
||||||
fuzzySignatureFactory.init(new MapSolrParams(sp));
|
fuzzySignatureFactory.init(new MapSolrParams(sp));
|
||||||
fuzzySignatureFactory.add(text);
|
fuzzySignatureFactory.add(text);
|
||||||
byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();
|
byte[] fuzzy_signature_hash = fuzzySignatureFactory.getSignature();
|
||||||
long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff);
|
long l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (fuzzy_signature_hash[i] & 0xff);
|
||||||
this.fuzzy_signature = l;
|
this.fuzzy_signature = l;
|
||||||
this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString();
|
this.fuzzy_signature_text = fuzzySignatureFactory.getSignatureText().toString();
|
||||||
Lookup3Signature exactSignatureFactory = new Lookup3Signature();
|
Lookup3Signature exactSignatureFactory = new Lookup3Signature();
|
||||||
exactSignatureFactory.add(text);
|
exactSignatureFactory.add(text);
|
||||||
byte[] exact_signature_hash = exactSignatureFactory.getSignature();
|
byte[] exact_signature_hash = exactSignatureFactory.getSignature();
|
||||||
l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff);
|
l = 0; for (int i = 0; i < 8; i++) l = (l << 8) + (exact_signature_hash[i] & 0xff);
|
||||||
this.exact_signature = l;
|
this.exact_signature = l;
|
||||||
}
|
}
|
||||||
|
|
||||||
private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
|
private Condenser(final String text, final WordCache meaningLib, boolean doAutotagging) {
|
||||||
this.languageIdentificator = null; // we don't need that here
|
this.languageIdentificator = null; // we don't need that here
|
||||||
// analysis = new Properties();
|
// analysis = new Properties();
|
||||||
this.words = new TreeMap<String, Word>();
|
this.words = new TreeMap<String, Word>();
|
||||||
this.synonyms = new HashSet<String>();
|
this.synonyms = new HashSet<String>();
|
||||||
createCondensement(text, meaningLib, doAutotagging);
|
createCondensement(text, meaningLib, doAutotagging);
|
||||||
}
|
}
|
||||||
|
|
||||||
private void insertTextToWords(
|
private void insertTextToWords(
|
||||||
final SentenceReader text,
|
final SentenceReader text,
|
||||||
final int phrase,
|
final int phrase,
|
||||||
final int flagpos,
|
final int flagpos,
|
||||||
final Bitfield flagstemplate,
|
final Bitfield flagstemplate,
|
||||||
final boolean useForLanguageIdentification,
|
final boolean useForLanguageIdentification,
|
||||||
final WordCache meaningLib) {
|
final WordCache meaningLib) {
|
||||||
if (text == null) return;
|
if (text == null) return;
|
||||||
String word;
|
String word;
|
||||||
Word wprop;
|
Word wprop;
|
||||||
WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
|
WordTokenizer wordenum = new WordTokenizer(text, meaningLib);
|
||||||
try {
|
try {
|
||||||
int pip = 0;
|
int pip = 0;
|
||||||
while (wordenum.hasMoreElements()) {
|
while (wordenum.hasMoreElements()) {
|
||||||
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
|
word = (wordenum.nextElement().toString()).toLowerCase(Locale.ENGLISH);
|
||||||
if (useForLanguageIdentification) this.languageIdentificator.add(word);
|
if (useForLanguageIdentification) this.languageIdentificator.add(word);
|
||||||
if (word.length() < 2) continue;
|
if (word.length() < 2) continue;
|
||||||
wprop = this.words.get(word);
|
wprop = this.words.get(word);
|
||||||
if (wprop == null) wprop = new Word(0, pip, phrase);
|
if (wprop == null) wprop = new Word(0, pip, phrase);
|
||||||
if (wprop.flags == null) wprop.flags = flagstemplate.clone();
|
if (wprop.flags == null) wprop.flags = flagstemplate.clone();
|
||||||
wprop.flags.set(flagpos, true);
|
wprop.flags.set(flagpos, true);
|
||||||
this.words.put(word, wprop);
|
this.words.put(word, wprop);
|
||||||
pip++;
|
pip++;
|
||||||
this.RESULT_NUMB_WORDS++;
|
this.RESULT_NUMB_WORDS++;
|
||||||
this.RESULT_DIFF_WORDS++;
|
this.RESULT_DIFF_WORDS++;
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
wordenum.close();
|
wordenum.close();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
public int excludeWords(final SortedSet<String> stopwords) {
|
public int excludeWords(final SortedSet<String> stopwords) {
|
||||||
// subtracts the given stopwords from the word list
|
// subtracts the given stopwords from the word list
|
||||||
// the word list shrinkes. This returns the number of shrinked words
|
// the word list shrinkes. This returns the number of shrinked words
|
||||||
final int oldsize = this.words.size();
|
final int oldsize = this.words.size();
|
||||||
SetTools.excludeDestructive(this.words, stopwords);
|
SetTools.excludeDestructive(this.words, stopwords);
|
||||||
return oldsize - this.words.size();
|
return oldsize - this.words.size();
|
||||||
}
|
}
|
||||||
|
|
||||||
public Map<String, Word> words() {
|
public Map<String, Word> words() {
|
||||||
// returns the words as word/indexWord relation map
|
// returns the words as word/indexWord relation map
|
||||||
return this.words;
|
return this.words;
|
||||||
}
|
}
|
||||||
|
|
||||||
public List<String> synonyms() {
|
public List<String> synonyms() {
|
||||||
ArrayList<String> l = new ArrayList<String>(this.synonyms.size());
|
ArrayList<String> l = new ArrayList<String>(this.synonyms.size());
|
||||||
for (String s: this.synonyms) l.add(s);
|
for (String s: this.synonyms) l.add(s);
|
||||||
return l;
|
return l;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long fuzzySignature() {
|
public long fuzzySignature() {
|
||||||
return this.fuzzy_signature;
|
return this.fuzzy_signature;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String fuzzySignatureText() {
|
public String fuzzySignatureText() {
|
||||||
return this.fuzzy_signature_text;
|
return this.fuzzy_signature_text;
|
||||||
}
|
}
|
||||||
|
|
||||||
public long exactSignature() {
|
public long exactSignature() {
|
||||||
return this.exact_signature;
|
return this.exact_signature;
|
||||||
}
|
}
|
||||||
|
|
||||||
public String language() {
|
public String language() {
|
||||||
return this.languageIdentificator.getLanguage();
|
return this.languageIdentificator.getLanguage();
|
||||||
}
|
}
|
||||||
|
|
||||||
private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
|
private void createCondensement(final String text, final WordCache meaningLib, boolean doAutotagging) {
|
||||||
assert text != null;
|
assert text != null;
|
||||||
final Set<String> currsentwords = new HashSet<String>();
|
final Set<String> currsentwords = new HashSet<String>();
|
||||||
String word = "";
|
String word = "";
|
||||||
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
|
String[] wordcache = new String[LibraryProvider.autotagging.getMaxWordsInTerm() - 1];
|
||||||
for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
|
for (int i = 0; i < wordcache.length; i++) wordcache[i] = "";
|
||||||
String k;
|
String k;
|
||||||
Tagging.Metatag tag;
|
Tagging.Metatag tag;
|
||||||
int wordlen;
|
int wordlen;
|
||||||
Word wsp;
|
Word wsp;
|
||||||
final Word wsp1;
|
final Word wsp1;
|
||||||
int wordHandle;
|
int wordHandle;
|
||||||
int wordHandleCount = 0;
|
int wordHandleCount = 0;
|
||||||
final int sentenceHandleCount = 0;
|
final int sentenceHandleCount = 0;
|
||||||
int allwordcounter = 0;
|
int allwordcounter = 0;
|
||||||
final int allsentencecounter = 0;
|
final int allsentencecounter = 0;
|
||||||
int wordInSentenceCounter = 1;
|
int wordInSentenceCounter = 1;
|
||||||
boolean comb_indexof = false, last_last = false, last_index = false;
|
boolean comb_indexof = false, last_last = false, last_index = false;
|
||||||
final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
|
final Map<StringBuilder, Phrase> sentences = new HashMap<StringBuilder, Phrase>(100);
|
||||||
if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
|
if (LibraryProvider.autotagging.isEmpty()) doAutotagging = false;
|
||||||
|
|
||||||
// read source
|
// read source
|
||||||
final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
|
final WordTokenizer wordenum = new WordTokenizer(new SentenceReader(text), meaningLib);
|
||||||
try {
|
try {
|
||||||
while (wordenum.hasMoreElements()) {
|
while (wordenum.hasMoreElements()) {
|
||||||
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
|
word = wordenum.nextElement().toString().toLowerCase(Locale.ENGLISH);
|
||||||
if (this.languageIdentificator != null) this.languageIdentificator.add(word);
|
if (this.languageIdentificator != null) this.languageIdentificator.add(word);
|
||||||
if (word.length() < wordminsize) continue;
|
if (word.length() < wordminsize) continue;
|
||||||
|
|
||||||
// get tags from autotagging
|
// get tags from autotagging
|
||||||
if (doAutotagging) {
|
if (doAutotagging) {
|
||||||
for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
|
for (int wordc = 1; wordc <= wordcache.length + 1; wordc++) {
|
||||||
// wordc is number of words that are tested
|
// wordc is number of words that are tested
|
||||||
StringBuilder sb = new StringBuilder();
|
StringBuilder sb = new StringBuilder();
|
||||||
if (wordc == 1) {
|
if (wordc == 1) {
|
||||||
sb.append(word);
|
sb.append(word);
|
||||||
} else {
|
} else {
|
||||||
for (int w = 0; w < wordc - 1; w++) {
|
for (int w = 0; w < wordc - 1; w++) {
|
||||||
sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
|
sb.append(wordcache[wordcache.length - wordc + w + 1]).append(' ');
|
||||||
}
|
}
|
||||||
sb.append(word);
|
sb.append(word);
|
||||||
}
|
}
|
||||||
String testterm = sb.toString().trim();
|
String testterm = sb.toString().trim();
|
||||||
//System.out.println("Testing: " + testterm);
|
//System.out.println("Testing: " + testterm);
|
||||||
tag = LibraryProvider.autotagging.getTagFromTerm(testterm);
|
tag = LibraryProvider.autotagging.getTagFromTerm(testterm);
|
||||||
if (tag != null) {
|
if (tag != null) {
|
||||||
String navigatorName = tag.getVocabularyName();
|
String navigatorName = tag.getVocabularyName();
|
||||||
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
|
Set<Tagging.Metatag> tagset = this.tags.get(navigatorName);
|
||||||
if (tagset == null) {
|
if (tagset == null) {
|
||||||
tagset = new HashSet<Tagging.Metatag>();
|
tagset = new HashSet<Tagging.Metatag>();
|
||||||
this.tags.put(navigatorName, tagset);
|
this.tags.put(navigatorName, tagset);
|
||||||
}
|
}
|
||||||
tagset.add(tag);
|
tagset.add(tag);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// shift wordcache
|
// shift wordcache
|
||||||
System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
|
System.arraycopy(wordcache, 1, wordcache, 0, wordcache.length - 1);
|
||||||
wordcache[wordcache.length - 1] = word;
|
wordcache[wordcache.length - 1] = word;
|
||||||
|
|
||||||
// distinguish punctuation and words
|
// distinguish punctuation and words
|
||||||
wordlen = word.length();
|
wordlen = word.length();
|
||||||
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
|
if (wordlen == 1 && SentenceReader.punctuation(word.charAt(0))) {
|
||||||
// store sentence
|
// store sentence
|
||||||
currsentwords.clear();
|
currsentwords.clear();
|
||||||
wordInSentenceCounter = 1;
|
wordInSentenceCounter = 1;
|
||||||
} else {
|
} else {
|
||||||
// check index.of detection
|
// check index.of detection
|
||||||
if (last_last && comb_indexof && word.equals("modified")) {
|
if (last_last && comb_indexof && word.equals("modified")) {
|
||||||
this.RESULT_FLAGS.set(flag_cat_indexof, true);
|
this.RESULT_FLAGS.set(flag_cat_indexof, true);
|
||||||
wordenum.pre(true); // parse lines as they come with CRLF
|
wordenum.pre(true); // parse lines as they come with CRLF
|
||||||
}
|
}
|
||||||
if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
|
if (last_index && (wordminsize > 2 || word.equals("of"))) comb_indexof = true;
|
||||||
last_last = word.equals("last");
|
last_last = word.equals("last");
|
||||||
last_index = word.equals("index");
|
last_index = word.equals("index");
|
||||||
|
|
||||||
// store word
|
// store word
|
||||||
allwordcounter++;
|
allwordcounter++;
|
||||||
currsentwords.add(word);
|
currsentwords.add(word);
|
||||||
wsp = this.words.get(word);
|
wsp = this.words.get(word);
|
||||||
if (wsp != null) {
|
if (wsp != null) {
|
||||||
// word already exists
|
// word already exists
|
||||||
wordHandle = wsp.posInText;
|
wordHandle = wsp.posInText;
|
||||||
wsp.inc();
|
wsp.inc();
|
||||||
} else {
|
} else {
|
||||||
// word does not yet exist, create new word entry
|
// word does not yet exist, create new word entry
|
||||||
wordHandle = wordHandleCount++;
|
wordHandle = wordHandleCount++;
|
||||||
wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
|
wsp = new Word(wordHandle, wordInSentenceCounter, sentences.size() + 100);
|
||||||
wsp.flags = this.RESULT_FLAGS.clone();
|
wsp.flags = this.RESULT_FLAGS.clone();
|
||||||
this.words.put(word, wsp);
|
this.words.put(word, wsp);
|
||||||
}
|
}
|
||||||
// we now have the unique handle of the word, put it into the sentence:
|
// we now have the unique handle of the word, put it into the sentence:
|
||||||
wordInSentenceCounter++;
|
wordInSentenceCounter++;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
} finally {
|
} finally {
|
||||||
wordenum.close();
|
wordenum.close();
|
||||||
}
|
}
|
||||||
|
|
||||||
if (pseudostemming) {
|
if (pseudostemming) {
|
||||||
Map.Entry<String, Word> entry;
|
Map.Entry<String, Word> entry;
|
||||||
// we search for similar words and reorganize the corresponding sentences
|
// we search for similar words and reorganize the corresponding sentences
|
||||||
// a word is similar, if a shortened version is equal
|
// a word is similar, if a shortened version is equal
|
||||||
final Iterator<Map.Entry<String, Word>> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order
|
final Iterator<Map.Entry<String, Word>> wi = this.words.entrySet().iterator(); // enumerates the keys in descending order
|
||||||
wordsearch: while (wi.hasNext()) {
|
wordsearch: while (wi.hasNext()) {
|
||||||
entry = wi.next();
|
entry = wi.next();
|
||||||
word = entry.getKey();
|
word = entry.getKey();
|
||||||
wordlen = word.length();
|
wordlen = word.length();
|
||||||
wsp = entry.getValue();
|
wsp = entry.getValue();
|
||||||
for (int i = wordcut; i > 0; i--) {
|
for (int i = wordcut; i > 0; i--) {
|
||||||
if (wordlen > i) {
|
if (wordlen > i) {
|
||||||
k = word.substring(0, wordlen - i);
|
k = word.substring(0, wordlen - i);
|
||||||
if (this.words.containsKey(k)) {
|
if (this.words.containsKey(k)) {
|
||||||
// update word counter
|
// update word counter
|
||||||
wsp1.count = wsp1.count + wsp.count;
|
wsp1.count = wsp1.count + wsp.count;
|
||||||
this.words.put(k, wsp1);
|
this.words.put(k, wsp1);
|
||||||
// remove current word
|
// remove current word
|
||||||
wi.remove();
|
wi.remove();
|
||||||
continue wordsearch;
|
continue wordsearch;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// store result
|
// store result
|
||||||
//this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
|
//this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
|
||||||
this.RESULT_NUMB_WORDS = allwordcounter;
|
this.RESULT_NUMB_WORDS = allwordcounter;
|
||||||
this.RESULT_DIFF_WORDS = wordHandleCount;
|
this.RESULT_DIFF_WORDS = wordHandleCount;
|
||||||
this.RESULT_NUMB_SENTENCES = allsentencecounter;
|
this.RESULT_NUMB_SENTENCES = allsentencecounter;
|
||||||
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
|
this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
|
public static Map<String, Word> getWords(final String text, final WordCache meaningLib) {
|
||||||
// returns a word/indexWord relation map
|
// returns a word/indexWord relation map
|
||||||
if (text == null) return null;
|
if (text == null) return null;
|
||||||
return new Condenser(text, meaningLib, false).words();
|
return new Condenser(text, meaningLib, false).words();
|
||||||
}
|
}
|
||||||
|
|
||||||
public static void main(final String[] args) {
|
public static void main(final String[] args) {
|
||||||
// read a property file and convert them into configuration lines
|
// read a property file and convert them into configuration lines
|
||||||
try {
|
try {
|
||||||
final File f = new File(args[0]);
|
final File f = new File(args[0]);
|
||||||
final Properties p = new Properties();
|
final Properties p = new Properties();
|
||||||
p.load(new FileInputStream(f));
|
p.load(new FileInputStream(f));
|
||||||
final StringBuilder sb = new StringBuilder();
|
final StringBuilder sb = new StringBuilder();
|
||||||
sb.append("{\n");
|
sb.append("{\n");
|
||||||
for (int i = 0; i <= 15; i++) {
|
for (int i = 0; i <= 15; i++) {
|
||||||
sb.append('"');
|
sb.append('"');
|
||||||
final String s = p.getProperty("keywords" + i);
|
final String s = p.getProperty("keywords" + i);
|
||||||
final String[] l = s.split(",");
|
final String[] l = s.split(",");
|
||||||
for (final String element : l) {
|
for (final String element : l) {
|
||||||
sb.append(ASCII.String(Word.word2hash(element)));
|
sb.append(ASCII.String(Word.word2hash(element)));
|
||||||
}
|
}
|
||||||
if (i < 15) sb.append(",\n");
|
if (i < 15) sb.append(",\n");
|
||||||
}
|
}
|
||||||
sb.append("}\n");
|
sb.append("}\n");
|
||||||
System.out.println(sb.toString());
|
System.out.println(sb.toString());
|
||||||
} catch (final FileNotFoundException e) {
|
} catch (final FileNotFoundException e) {
|
||||||
Log.logException(e);
|
Log.logException(e);
|
||||||
} catch (final IOException e) {
|
} catch (final IOException e) {
|
||||||
Log.logException(e);
|
Log.logException(e);
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in new issue