luc 9 years ago
commit bfe51001e3

@ -31,7 +31,7 @@
<classpathentry kind="lib" path="lib/jcl-over-slf4j-1.7.12.jar"/>
<classpathentry kind="lib" path="lib/slf4j-jdk14-1.7.12.jar"/>
<classpathentry kind="lib" path="lib/log4j-over-slf4j-1.7.12.jar"/>
<classpathentry kind="lib" path="lib/httpcore-4.4.3.jar"/>
<classpathentry kind="lib" path="lib/httpcore-4.4.4.jar"/>
<classpathentry kind="lib" path="lib/metadata-extractor-2.8.1.jar"/>
<classpathentry kind="lib" path="lib/xmpcore-5.1.2.jar"/>
<classpathentry kind="lib" path="lib/jcifs-1.3.17.jar"/>
@ -92,12 +92,11 @@
<classpathentry kind="lib" path="lib/jsoup-1.8.3.jar"/>
<classpathentry kind="lib" path="lib/javax.servlet-api-3.1.0.jar"/>
<classpathentry kind="lib" path="lib/weupnp-0.1.3.jar"/>
<classpathentry kind="lib" path="lib/common-image-3.1.2.jar"/>
<classpathentry kind="lib" path="lib/common-io-3.1.2.jar"/>
<classpathentry kind="lib" path="lib/common-lang-3.1.2.jar"/>
<classpathentry kind="lib" path="lib/imageio-core-3.1.2.jar"/>
<classpathentry kind="lib" path="lib/imageio-metadata-3.1.2.jar"/>
<classpathentry kind="lib" path="lib/imageio-tiff-3.1.2.jar"/>
<classpathentry kind="lib" path="lib/servlet-3.1.2.jar"/>
<classpathentry kind="lib" path="lib/common-image-3.2.jar"/>
<classpathentry kind="lib" path="lib/common-io-3.2.jar"/>
<classpathentry kind="lib" path="lib/common-lang-3.2.jar"/>
<classpathentry kind="lib" path="lib/imageio-core-3.2.jar"/>
<classpathentry kind="lib" path="lib/imageio-metadata-3.2.jar"/>
<classpathentry kind="lib" path="lib/imageio-tiff-3.2.jar"/>
<classpathentry kind="output" path="gen"/>
</classpath>

@ -165,9 +165,9 @@
<pathelement location="${lib}/bcmail-jdk15-1.46.jar" />
<pathelement location="${lib}/bcprov-jdk15-1.46.jar" />
<pathelement location="${lib}/chardet.jar" />
<pathelement location="${lib}/common-image-3.1.2.jar" />
<pathelement location="${lib}/common-io-3.1.2.jar" />
<pathelement location="${lib}/common-lang-3.1.2.jar" />
<pathelement location="${lib}/common-image-3.2.jar" />
<pathelement location="${lib}/common-io-3.2.jar" />
<pathelement location="${lib}/common-lang-3.2.jar" />
<pathelement location="${lib}/commons-codec-1.10.jar" />
<pathelement location="${lib}/commons-compress-1.10.jar" />
<pathelement location="${lib}/commons-fileupload-1.3.1.jar" />
@ -179,12 +179,12 @@
<pathelement location="${lib}/guava-18.0.jar" />
<pathelement location="${lib}/htmllexer.jar" />
<pathelement location="${lib}/httpclient-4.5.1.jar" />
<pathelement location="${lib}/httpcore-4.4.3.jar" />
<pathelement location="${lib}/httpcore-4.4.4.jar" />
<pathelement location="${lib}/httpmime-4.5.1.jar" />
<pathelement location="${lib}/icu4j-56_1.jar" />
<pathelement location="${lib}/imageio-core-3.1.2.jar" />
<pathelement location="${lib}/imageio-metadata-3.1.2.jar" />
<pathelement location="${lib}/imageio-tiff-3.1.2.jar" />
<pathelement location="${lib}/imageio-core-3.2.jar" />
<pathelement location="${lib}/imageio-metadata-3.2.jar" />
<pathelement location="${lib}/imageio-tiff-3.2.jar" />
<pathelement location="${lib}/J7Zip-modified.jar" />
<pathelement location="${lib}/jakarta-oro-2.0.8.jar" />
<pathelement location="${lib}/jaudiotagger-2.0.4-20111207.115108-15.jar" />
@ -232,7 +232,6 @@
<pathelement location="${lib}/pdfbox-1.8.10.jar" />
<pathelement location="${lib}/poi-3.13-20150929.jar" />
<pathelement location="${lib}/poi-scratchpad-3.13-20150929.jar" />
<pathelement location="${lib}/servlet-3.1.2.jar" />
<pathelement location="${lib}/slf4j-api-1.7.12.jar" />
<pathelement location="${lib}/slf4j-jdk14-1.7.12.jar" />
<pathelement location="${lib}/solr-core-5.3.1.jar" />

@ -28,7 +28,7 @@
</p>
<h3>Active list: #(disabled)#
#{blackLists}#
&nbsp;[&nbsp;<em>#[name]#</em>&nbsp;]&nbsp;
&nbsp;[&nbsp;<em><a href="Blacklist_p.html?selectList&selectedListName=#[name]#">#[name]#</a></em>&nbsp;]&nbsp;
#{/blackLists}#
::<span class="error">No blacklist selected</span>#(/disabled)#</h3>
<!-- blacklist selection -->

@ -114,12 +114,11 @@ public class getpageinfo {
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
// put keywords
final String list[] = scraper.dc_subject();
final Set<String> list = scraper.dc_subject();
int count = 0;
for (final String element: list) {
final String tag = element;
if (!tag.equals("")) {
prop.putXML("tags_"+count+"_tag", tag);
if (!element.equals("")) {
prop.putXML("tags_"+count+"_tag", element);
count++;
}
}

@ -114,12 +114,11 @@ public class getpageinfo_p {
prop.put("favicon", (scraper.getFavicon()==null) ? "" : scraper.getFavicon().toString());
// put keywords
final String list[] = scraper.dc_subject();
final Set<String> list = scraper.dc_subject();
int count = 0;
for (final String element: list) {
final String tag = element;
if (!tag.equals("")) {
prop.putXML("tags_"+count+"_tag", tag);
if (!element.equals("")) {
prop.putXML("tags_"+count+"_tag", element);
count++;
}
}

@ -9,7 +9,6 @@
"title": "Search for #[rss_query]#",
"link": "#[searchBaseURL]#?query=#[rss_queryenc]#&amp;resource=#[resource]#&amp;contentdom=#[contentdom]#"
},
"totalResults": "#[num-results_totalcount]#",
"startIndex": "#[num-results_offset]#",
"itemsPerPage": "#[num-results_itemsPerPage]#",
"searchTerms": "#[rss_queryenc]#",

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

Binary file not shown.

@ -2166,6 +2166,21 @@ reindex documents containing these fields:==Re-Indiziere Dokumente die folgende
#"is empty"=="ist leer"
#"no reindex job running"=="Es läuft kein Re-Indexierungs-Job"
#"! reindex works only with embedded Solr index !"=="! Re-Indexierung funktioniert nur mit eingebautem Solr Index !"
Re-Crawl Index Documents==Re-Crawl Index Dokumente
Searches the local index and selects documents to add to the crawler (recrawl the document).==Durchsucht und selektiert Dokumente im lokalen Index und fügt diese dem Crawler hinzu (Dokumente erneut crawlen).
This runs transparent as background job.==Dies läuft transparent als Hintergrund Job.
Documents are added to the crawler only if no other crawls are active==Dokumente werden dem Crawler nur hinzugefügt, wenn kein anderer Crawl-Job aktiv ist
and are added in small chunks.==und werden in kleinen Blöcken hinzugefügt.
"start recrawl job now"=="Starte Re-Crawl-Job jetzt"
"stop recrawl job"=="Beende Re-Crawl-Job"
to re-crawl documents with fresh_date_dt before today.==um Dokumente mit fresh_date_dt vor Heute erneut zu crawlen.
after starting the recrawl job you can apply a custom Solr query to select documents to be processed==nach dem Start des Re-Crawl-Jobs kann die Solr Abfrage bearbeitet werden um gewünschte Dokumente zu verarbeiten
Re-Crawl Query Details==Re-Crawl Abfrage Details
Documents to process==Dokumente in Warteschlange
Current Query==Aktuelle Abfrage
Edit Solr Query==Solr Abfrage bearbeiten
update==aktualisieren
include failed urls==inklusive Fehler-Urls
#-----------------------------
#File: Load_MediawikiWiki.html

@ -83,7 +83,7 @@
<compilation-unit>
<package-root>source</package-root>
<package-root>htroot</package-root>
<classpath mode="compile">lib/J7Zip-modified.jar;lib/apache-mime4j-0.6.jar;lib/bcmail-jdk15-1.46.jar;lib/bcprov-jdk15-1.46.jar;lib/chardet.jar;lib/common-image-3.1.2.jar;lib/common-io-3.1.2.jar;lib/common-lang-3.1.2.jar;lib/commons-codec-1.10.jar;lib/commons-compress-1.10.jar;lib/commons-fileupload-1.3.1.jar;lib/commons-io-2.4.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.2.jar;lib/fontbox-1.8.10.jar;lib/guava-18.0.jar;lib/htmllexer.jar;lib/httpclient-4.5.1.jar;lib/httpcore-4.4.3.jar;lib/httpmime-4.5.1.jar;lib/icu4j-56_1.jar;lib/imageio-core-3.1.2.jar;lib/imageio-metadata-3.1.2.jar;lib/imageio-tiff-3.1.2.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/javax.servlet-api-3.1.0.jar;lib/jcifs-1.3.17.jar;lib/jcl-over-slf4j-1.7.12.jar;lib/jempbox-1.8.10.jar;lib/jetty-client-9.2.13.v20150730.jar;lib/jetty-continuation-9.2.13.v20150730.jar;lib/jetty-deploy-9.2.13.v20150730.jar;lib/jetty-http-9.2.13.v20150730.jar;lib/jetty-io-9.2.13.v20150730.jar;lib/jetty-jmx-9.2.13.v20150730.jar;lib/jetty-proxy-9.2.13.v20150730.jar;lib/jetty-security-9.2.13.v20150730.jar;lib/jetty-server-9.2.13.v20150730.jar;lib/jetty-servlet-9.2.13.v20150730.jar;lib/jetty-servlets-9.2.13.v20150730.jar;lib/jetty-util-9.2.13.v20150730.jar;lib/jetty-webapp-9.2.13.v20150730.jar;lib/jetty-xml-9.2.13.v20150730.jar;lib/jsch-0.1.53.jar;lib/json-simple-1.1.1.jar;lib/jsoup-1.8.3.jar;lib/log4j-over-slf4j-1.7.12.jar;lib/lucene-analyzers-common-5.3.1.jar;lib/lucene-analyzers-phonetic-5.3.1.jar;lib/lucene-backward-codecs-5.3.1.jar;lib/lucene-classification-5.3.1.jar;lib/lucene-codecs-5.3.1.jar;lib/lucene-core-5.3.1.jar;lib/lucene-facet-5.3.1.jar;lib/lucene-grouping-5.3.1.jar;lib/lucene-highlighter-5.3.1.jar;lib/lucene-join-5.3.1.jar;lib/lucene-memory-5.3.1.jar;lib/lucene-misc-5.3.1.jar;lib/lucene-queries-5.3.1.jar;lib/lucene-queryparser-5.3.1.jar;lib/lucene-spatial-5.3.1.jar;lib/lucene-suggest-5.3.1.jar;lib/metadata-extractor-2.8.1.jar;lib/noggit-0.6.jar;l
ib/org.restlet.jar;lib/pdfbox-1.8.10.jar;lib/poi-3.13-20150929.jar;lib/poi-scratchpad-3.13-20150929.jar;lib/servlet-3.1.2.jar;lib/slf4j-api-1.7.12.jar;lib/slf4j-jdk14-1.7.12.jar;lib/solr-core-5.3.1.jar;lib/solr-solrj-5.3.1.jar;lib/spatial4j-0.4.1.jar;lib/stax2-api-3.1.4.jar;lib/webcat-0.1-swf.jar;lib/weupnp-0.1.3.jar;lib/woodstox-core-asl-4.4.1.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/xmpcore-5.1.2.jar;lib/zookeeper-3.4.6.jar</classpath>
<classpath mode="compile">lib/J7Zip-modified.jar;lib/apache-mime4j-0.6.jar;lib/bcmail-jdk15-1.46.jar;lib/bcprov-jdk15-1.46.jar;lib/chardet.jar;lib/common-image-3.2.jar;lib/common-io-3.2.jar;lib/common-lang-3.2.jar;lib/commons-codec-1.10.jar;lib/commons-compress-1.10.jar;lib/commons-fileupload-1.3.1.jar;lib/commons-io-2.4.jar;lib/commons-jxpath-1.3.jar;lib/commons-lang-2.6.jar;lib/commons-logging-1.2.jar;lib/fontbox-1.8.10.jar;lib/guava-18.0.jar;lib/htmllexer.jar;lib/httpclient-4.5.1.jar;lib/httpcore-4.4.4.jar;lib/httpmime-4.5.1.jar;lib/icu4j-56_1.jar;lib/imageio-core-3.2.jar;lib/imageio-metadata-3.2.jar;lib/imageio-tiff-3.2.jar;lib/jakarta-oro-2.0.8.jar;lib/jaudiotagger-2.0.4-20111207.115108-15.jar;lib/javax.servlet-api-3.1.0.jar;lib/jcifs-1.3.17.jar;lib/jcl-over-slf4j-1.7.12.jar;lib/jempbox-1.8.10.jar;lib/jetty-client-9.2.13.v20150730.jar;lib/jetty-continuation-9.2.13.v20150730.jar;lib/jetty-deploy-9.2.13.v20150730.jar;lib/jetty-http-9.2.13.v20150730.jar;lib/jetty-io-9.2.13.v20150730.jar;lib/jetty-jmx-9.2.13.v20150730.jar;lib/jetty-proxy-9.2.13.v20150730.jar;lib/jetty-security-9.2.13.v20150730.jar;lib/jetty-server-9.2.13.v20150730.jar;lib/jetty-servlet-9.2.13.v20150730.jar;lib/jetty-servlets-9.2.13.v20150730.jar;lib/jetty-util-9.2.13.v20150730.jar;lib/jetty-webapp-9.2.13.v20150730.jar;lib/jetty-xml-9.2.13.v20150730.jar;lib/jsch-0.1.53.jar;lib/json-simple-1.1.1.jar;lib/jsoup-1.8.3.jar;lib/log4j-over-slf4j-1.7.12.jar;lib/lucene-analyzers-common-5.3.1.jar;lib/lucene-analyzers-phonetic-5.3.1.jar;lib/lucene-backward-codecs-5.3.1.jar;lib/lucene-classification-5.3.1.jar;lib/lucene-codecs-5.3.1.jar;lib/lucene-core-5.3.1.jar;lib/lucene-facet-5.3.1.jar;lib/lucene-grouping-5.3.1.jar;lib/lucene-highlighter-5.3.1.jar;lib/lucene-join-5.3.1.jar;lib/lucene-memory-5.3.1.jar;lib/lucene-misc-5.3.1.jar;lib/lucene-queries-5.3.1.jar;lib/lucene-queryparser-5.3.1.jar;lib/lucene-spatial-5.3.1.jar;lib/lucene-suggest-5.3.1.jar;lib/metadata-extractor-2.8.1.jar;lib/noggit-0.6.jar;lib/org.restl
et.jar;lib/pdfbox-1.8.10.jar;lib/poi-3.13-20150929.jar;lib/poi-scratchpad-3.13-20150929.jar;lib/slf4j-api-1.7.12.jar;lib/slf4j-jdk14-1.7.12.jar;lib/solr-core-5.3.1.jar;lib/solr-solrj-5.3.1.jar;lib/spatial4j-0.4.1.jar;lib/stax2-api-3.1.4.jar;lib/webcat-0.1-swf.jar;lib/weupnp-0.1.3.jar;lib/woodstox-core-asl-4.4.1.jar;lib/xercesImpl.jar;lib/xml-apis.jar;lib/xmpcore-5.1.2.jar;lib/zookeeper-3.4.6.jar</classpath>
<built-to>lib/yacycore.jar</built-to>
<source-level>1.7</source-level>
</compilation-unit>

@ -395,11 +395,6 @@
<artifactId>commons-logging</artifactId>
<version>1.2</version>
</dependency>
<dependency>
<groupId>com.twelvemonkeys.imageio</groupId>
<artifactId>imageio-tiff</artifactId>
<version>3.1.2</version>
</dependency>
<dependency>
<groupId>org.apache.pdfbox</groupId>
<artifactId>fontbox</artifactId>
@ -435,6 +430,11 @@
<artifactId>icu4j</artifactId>
<version>56.1</version>
</dependency>
<dependency>
<groupId>com.twelvemonkeys.imageio</groupId>
<artifactId>imageio-tiff</artifactId>
<version>3.2</version>
</dependency>
<dependency>
<groupId>org</groupId>
<artifactId>jaudiotagger</artifactId>

@ -971,8 +971,12 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return toNormalform(false);
}
/**
 * Tokenized url as string (without the protocol).
 * The url is rendered through urlstub() with anchor and session id excluded,
 * unescaped, and then handed to the single-argument toTokens variant.
 * @return example "host com path file ext"
 */
public String toTokens() {
    // urlstub() instead of toNormalform(): the protocol part must not become a token
    return toTokens(unescape(this.urlstub(true, true)));
}
/**
@ -1105,6 +1109,15 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
return result;
}
/**
* Generates a normal form of the url, without the protocol part,
* except the skipped protocol part this is identical with toNormalform()
* @see #toNormalform(boolean)
* @param excludeAnchor exclude anchor part
* @param removeSessionID exclude session id
* @return example "www.host.com:8080/path/file.html"
* @see #toNormalform(boolean, boolean)
*/
public String urlstub(final boolean excludeAnchor, final boolean removeSessionID) {
// generates a normal form of the URL
boolean defaultPort = false;

@ -46,7 +46,6 @@ import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@ -71,7 +70,7 @@ public class Document {
private DigestURL source; // the source url
private final String mimeType; // mimeType as taken from http header
private final String charset; // the charset of the document
private final List<String> keywords; // most resources provide a keyword field
private final Set<String> keywords; // most resources provide a keyword field
private List<String> titles; // the document titles, taken from title and/or h1 tag; shall appear as headline of search result
private final StringBuilder creator; // author or copyright
private final String publisher; // publisher
@ -115,7 +114,7 @@ public class Document {
this.mimeType = (mimeType == null) ? "application/octet-stream" : mimeType;
this.charset = charset;
this.parserObject = parserObject;
this.keywords = new LinkedList<String>();
this.keywords = new LinkedHashSet<String>();
if (keywords != null) this.keywords.addAll(Arrays.asList(keywords));
this.titles = (titles == null) ? new ArrayList<String>(1) : titles;
this.creator = (author == null) ? new StringBuilder(0) : new StringBuilder(author);
@ -214,6 +213,10 @@ dc_coverage
dc_rights
*/
/**
 * Returns the main document title, which is the first entry in the list of titles.
 * @return the first title, or an empty string if there is no title list or it is empty
 *         (NOTE(review): a null entry stored first in the list would be returned as null)
 */
public String dc_title() {
    if (this.titles == null || this.titles.isEmpty()) {
        return "";
    }
    return this.titles.get(0);
}
@ -222,6 +225,10 @@ dc_rights
return this.titles;
}
/**
* Sets the title of the document, replacing any existing titles.
* @param title
*/
public void setTitle(final String title) {
this.titles = new ArrayList<String>();
if (title != null) this.titles.add(title);
@ -239,11 +246,8 @@ dc_rights
* @param tags
*/
public void addTags(Set<String> tags) {
    // this.keywords is a Set, so add() already ignores entries that are present;
    // the caller's set is therefore only read and never modified here
    for (String s: tags) {
        // skip null and empty tags so they never end up as keywords
        if (s != null && !s.isEmpty()) this.keywords.add(s);
    }
}
@ -274,28 +278,27 @@ dc_rights
}
return gf;
}
public String[] dc_subject() {
// sort out doubles and empty words
final TreeSet<String> hs = new TreeSet<String>();
String s;
for (int i = 0; i < this.keywords.size(); i++) {
if (this.keywords.get(i) == null) continue;
s = (this.keywords.get(i)).trim();
if (!s.isEmpty()) hs.add(s);
}
final String[] t = new String[hs.size()];
int i = 0;
for (final String u: hs) t[i++] = u;
return t;
/**
* Get the set of keywords associated with the document
* @return set of unique keywords
*/
public Set<String> dc_subject() {
return this.keywords;
}
/**
 * Get the keywords associated with the document joined into one string,
 * with the given separator character between the keywords.
 *
 * @param separator character placed between two keywords
 * @return string of keywords or empty string if the document has no keywords
 */
public String dc_subject(final char separator) {
    if (this.keywords.isEmpty()) return "";
    // rough pre-sizing: assume about 8 characters per keyword including separator
    final StringBuilder sb = new StringBuilder(this.keywords.size() * 8);
    for (final String s: this.keywords) sb.append(s).append(separator);
    // every keyword was followed by a separator; cut off the trailing one
    return sb.substring(0, sb.length() - 1);
}
@ -427,10 +430,6 @@ dc_rights
return sentences;
}
/**
 * Returns the keywords field of this document directly (no copy is made).
 * NOTE(review): the keywords field is also shown declared as Set&lt;String&gt; and
 * callers use dc_subject(); this List-typed accessor looks superseded - confirm.
 * @return the document's keyword collection
 */
public List<String> getKeywords() {
return this.keywords;
}
public Collection<AnchorURL> getAnchors() {
// returns all links embedded as anchors (clickeable entities)
// this is a url(String)/text(String) map
@ -688,7 +687,7 @@ dc_rights
for (final Document doc: docs) {
this.sections.addAll(doc.sections);
this.titles.addAll(doc.titles());
this.keywords.addAll(doc.getKeywords());
this.keywords.addAll(doc.dc_subject());
for (String d: doc.dc_description()) this.descriptions.add(d);
if (!(this.text instanceof ByteArrayOutputStream)) {

@ -271,7 +271,6 @@ public class psParser extends AbstractParser implements Parser {
try {
// creating a tempfile
tempFile = FileUtils.createTempFile(this.getClass(), "temp.ps");
tempFile.deleteOnExit();
// copying inputstream into file
FileUtils.copy(source,tempFile);

@ -118,7 +118,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
final byte[] urlHash,
final int urlLength, // byte-length of complete URL
final int urlComps, // number of path components
final int titleLength, // length of description/length (longer are better?)
final int titlewordcount,// length of description/length (longer are better?)
final int hitcount, // how often appears this word in the text
final int wordcount, // total number of words
final int phrasecount, // total number of phrases
@ -141,7 +141,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry.setCol(col_urlhash, urlHash);
this.entry.setCol(col_lastModified, mddlm);
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words
this.entry.setCol(col_wordsInTitle, titlewordcount);
this.entry.setCol(col_wordsInText, wordcount);
this.entry.setCol(col_phrasesInText, phrasecount);
this.entry.setCol(col_doctype, new byte[]{(byte) doctype});
@ -163,7 +163,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
public WordReferenceRow(final byte[] urlHash,
final int urlLength, // byte-length of complete URL
final int urlComps, // number of path components
final int titleLength, // length of description/length (longer are better?)
final int titlewordcount,// length of description/length (longer are better?)
final int wordcount, // total number of words
final int phrasecount, // total number of phrases
final long lastmodified, // last-modified time of the document where word appears
@ -180,7 +180,7 @@ public final class WordReferenceRow extends AbstractReference implements WordRef
this.entry.setCol(col_urlhash, urlHash);
this.entry.setCol(col_lastModified, mddlm);
this.entry.setCol(col_freshUntil, Math.max(0, mddlm + (mddct - mddlm) * 2)); // TTL computation
this.entry.setCol(col_wordsInTitle, titleLength / 6); // word count estimation; TODO: change value handover to number of words
this.entry.setCol(col_wordsInTitle, titlewordcount);
this.entry.setCol(col_wordsInText, wordcount);
this.entry.setCol(col_phrasesInText, phrasecount);
this.entry.setCol(col_doctype, new byte[]{(byte) doctype});

@ -34,11 +34,9 @@ import java.util.concurrent.LinkedBlockingQueue;
import net.yacy.cora.date.MicroDate;
import net.yacy.cora.document.encoding.ASCII;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.util.ByteArray;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.data.meta.URIMetadataNode;
import net.yacy.kelondro.index.Row;
import net.yacy.kelondro.index.Row.Entry;
import net.yacy.kelondro.rwi.AbstractReference;
@ -50,11 +48,11 @@ import net.yacy.kelondro.workflow.WorkflowProcessor;
public class WordReferenceVars extends AbstractReference implements WordReference, Reference, Cloneable, Comparable<WordReferenceVars>, Comparator<WordReferenceVars> {
/**
* object for termination of concurrent blocking queue processing
*/
public static final WordReferenceVars poison = new WordReferenceVars();
protected static final byte[] default_language = UTF8.getBytes("en");
/**
* object for termination of concurrent blocking queue processing
*/
public static final WordReferenceVars poison = new WordReferenceVars();
protected static final byte[] default_language = UTF8.getBytes("en");
private final Bitfield flags;
private long lastModified;
@ -71,31 +69,6 @@ public class WordReferenceVars extends AbstractReference implements WordReferenc
private double termFrequency;
private final boolean local;
/**
 * Creates a reference from a document metadata node.
 * Language, flags, modification date, url hash, doctype and the link counters are
 * copied from the node; url length and number of url components are derived from
 * the normal form of the node's url. All word statistics are filled with the
 * placeholder value 1 because they cannot be computed easily at this point.
 *
 * @param md the metadata node the values are taken from
 * @param local stored unchanged in the local field of this reference
 */
public WordReferenceVars(final URIMetadataNode md, final boolean local) {
    // values copied directly from the metadata node
    this.language = md.language();
    this.flags = md.flags();
    this.lastModified = md.moddate().getTime();
    this.urlHash = md.hash();
    this.type = md.doctype();
    this.llocal = md.llocal();
    this.lother = md.lother();

    // a single placeholder position
    final LinkedBlockingQueue<Integer> singlePosition = new LinkedBlockingQueue<Integer>();
    singlePosition.add(1);
    this.positions = singlePosition;

    // url metrics are derived from the normal form of the url
    final String normalizedUrl = md.url().toNormalform(true);
    this.urlcomps = MultiProtocolURL.urlComps(normalizedUrl).length;
    this.urllength = normalizedUrl.length();

    // compute that later
    this.virtualAge = -1;

    // dummy values: these fields cannot be computed here very easily
    this.hitcount = 1;
    this.phrasesintext = 1;
    this.posinphrase = 1;
    this.posofphrase = 1;
    this.wordsintext = 1;
    this.wordsintitle = 1;
    this.termFrequency = 1;

    this.local = local;
}
public WordReferenceVars(
final byte[] urlHash,
final int urlLength, // byte-length of complete URL

@ -790,6 +790,16 @@ public final class FileUtils {
}
}
/**
* Creates a temp file in the default system tmp directory (System property "java.io.tmpdir")
* with a name constructed by combination of class name and name.
* Marks the file with deleteOnExit() to be at least deleted on shutdown of jvm
*
* @param classObj name is used as prefix
* @param name
* @return temp file
* @throws IOException
*/
public static final File createTempFile(final Class<?> classObj, final String name) throws IOException {
String parserClassName = classObj.getName();
int idx = parserClassName.lastIndexOf('.');
@ -809,6 +819,7 @@ public final class FileUtils {
File.createTempFile(
parserClassName + "_" + ((idx > -1) ? fileName.substring(0, idx) : fileName),
(!fileExt.isEmpty()) ? "." + fileExt : fileExt);
tempFile.deleteOnExit();
return tempFile;
}

@ -3205,7 +3205,7 @@ public final class Switchboard extends serverSwitch {
//final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString(post.get("bookmarkFolder","/crawlStart")));
final Set<String> tags=ListManager.string2set(BookmarkHelper.cleanTagsString("/crawlStart"));
tags.add("crawlStart");
final String[] keywords = scraper.dc_subject();
final Set<String> keywords = scraper.dc_subject();
if (keywords != null) {
for (final String k: keywords) {
final String kk = BookmarkHelper.cleanTagsString(k);

@ -56,6 +56,7 @@ import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.ResponseHeader;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ByteBuffer;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.LookAheadIterator;
import net.yacy.cora.util.SpaceExceededException;
@ -686,10 +687,10 @@ public class Segment {
// create a word prototype which is re-used for all entries
if ((this.termIndex != null && storeToRWI) || searchEvent != null) {
final int len = (document == null) ? urlLength : document.dc_title().length();
final int wordsintitle = CommonPattern.SPACES.split(dc_title).length; // same calculation as for CollectionSchema.title_words_val
final WordReferenceRow ientry = new WordReferenceRow(
url.hash(),
urlLength, urlComps, len,
urlLength, urlComps, wordsintitle,
condenser.RESULT_NUMB_WORDS,
condenser.RESULT_NUMB_SENTENCES,
modDate.getTime(),

Loading…
Cancel
Save