Michael Peter Christen 9 years ago
commit 079112358c

@ -48,20 +48,20 @@
<classpathentry kind="lib" path="lib/commons-io-2.5.jar"/>
<classpathentry kind="lib" path="lib/slf4j-api-1.7.21.jar"/>
<classpathentry kind="lib" path="lib/chardet.jar"/>
<classpathentry kind="lib" path="lib/jetty-client-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-continuation-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-deploy-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-http-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-io-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-jmx-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-proxy-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-security-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-server-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlet-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlets-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-util-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-webapp-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-xml-9.2.17.v20160517.jar"/>
<classpathentry kind="lib" path="lib/jetty-client-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-continuation-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-deploy-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-http-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-io-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-jmx-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-proxy-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-security-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-server-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlet-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-servlets-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-util-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-webapp-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/jetty-xml-9.2.18.v20160721.jar"/>
<classpathentry kind="lib" path="lib/httpclient-4.5.2.jar"/>
<classpathentry kind="lib" path="lib/httpmime-4.5.2.jar"/>
<classpathentry kind="lib" path="lib/noggit-0.6.jar"/>

@ -190,20 +190,20 @@
<pathelement location="${lib}/javax.servlet-api-3.1.0.jar" />
<pathelement location="${lib}/jcifs-1.3.17.jar" />
<pathelement location="${lib}/jcl-over-slf4j-1.7.21.jar" />
<pathelement location="${lib}/jetty-client-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-continuation-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-deploy-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-http-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-io-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-jmx-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-proxy-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-security-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-server-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-servlet-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-servlets-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-util-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-webapp-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-xml-9.2.17.v20160517.jar" />
<pathelement location="${lib}/jetty-client-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-continuation-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-deploy-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-http-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-io-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-jmx-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-proxy-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-security-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-server-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-servlet-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-servlets-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-util-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-webapp-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jetty-xml-9.2.18.v20160721.jar" />
<pathelement location="${lib}/jsch-0.1.53.jar" />
<pathelement location="${lib}/json-simple-1.1.1.jar" />
<pathelement location="${lib}/jsoup-1.9.2.jar" />

@ -146,7 +146,7 @@
</pre></fieldset>
This would look like:
<iframe name="target"
src="http://#[myPreviewAddress]#/index.html?display=2&amp;resource=local&amp;focus=0"
src="#[myPreviewProtocol]#://#[myPreviewAddress]#/index.html?display=2&amp;resource=local&amp;focus=0"
width="100%"
height="410"
frameborder="0"
@ -169,7 +169,7 @@
</pre></fieldset>
This would look like:
<iframe name="target2"
src="http://#[myPreviewAddress]#/yacysearch.html?display=2&amp;resource=local&amp;focus=0"
src="#[myPreviewProtocol]#://#[myPreviewAddress]#/yacysearch.html?display=2&amp;resource=local&amp;focus=0"
width="100%"
height="180"
frameborder="0"
@ -193,7 +193,7 @@
</pre></fieldset>
This would look like:
<iframe name="target3"
src="http://#[myPreviewAddress]#/yacyinteractive.html?display=2&amp;focus=0"
src="#[myPreviewProtocol]#://#[myPreviewAddress]#/yacyinteractive.html?display=2&amp;focus=0"
width="100%"
height="180"
frameborder="0"

@ -33,6 +33,8 @@ import java.net.MalformedURLException;
import java.util.Properties;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.data.WorkTables;
@ -224,19 +226,30 @@ public class ConfigPortal {
prop.put("target_selected_special_searchresult", "searchresult".equals(target_special) ? 1 : 0);
prop.put("target_special_pattern", sb.getConfig(SwitchboardConstants.SEARCH_TARGET_SPECIAL_PATTERN, ""));
/* Addresse used in code template */
/* Address used in code template */
String myaddress = (sb.peers == null) || sb.peers.mySeed() == null || sb.peers.mySeed().getIP() == null ? null : sb.peers.mySeed().getPublicAddress(sb.peers.mySeed().getIP());
if (myaddress == null) {
myaddress = "localhost:" + sb.getLocalPort();
}
prop.put("myaddress", myaddress);
/* Adress used to display iframe preview : no need to use public adress when coming from local */
String myPreviewAddress = myaddress;
if(header.accessFromLocalhost()) {
myPreviewAddress = "localhost:" + sb.getLocalPort();
/* Address used to generate the preview frames : let's use the address and port as requested. (Same behavior as opensearchdescription.java) */
String myPreviewAddress = header.get(HeaderFramework.HOST); // returns host:port (if not default http/https ports)
String myPreviewProtocol = "http";
if (myPreviewAddress == null) {
myPreviewAddress = Domains.LOCALHOST + ":" + sb.getConfig("port", "8090");
} else {
final String sslport = ":" + sb.getConfig("port.ssl", "8443");
if (myPreviewAddress.endsWith(sslport)) { // connection on ssl port, use https protocol
myPreviewProtocol = "https";
}
}
/* YaCyDefaultServlet should have filled this custom header, making sure we know here whether the original request is http or https
* (when default ports (80 and 443) are used, there is no way to distinguish the two schemes relying only on the Host header) */
myPreviewProtocol = header.get(HeaderFramework.X_YACY_REQUEST_SCHEME, myPreviewProtocol);
prop.put("myPreviewAddress", myPreviewAddress);
prop.put("myPreviewProtocol", myPreviewProtocol);
return prop;
}
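Condensed, the resolution of the two new template placeholders follows this order; a hedged sketch of the servlet logic above, with illustrative variable names and the same default port values used in the hunk:

// how #[myPreviewProtocol]# and #[myPreviewAddress]# are derived
String previewAddress = header.get(HeaderFramework.HOST);                      // "host:port" as requested by the browser
String previewProtocol = "http";
if (previewAddress == null) {
    previewAddress = Domains.LOCALHOST + ":" + sb.getConfig("port", "8090");   // no Host header: local fallback
} else if (previewAddress.endsWith(":" + sb.getConfig("port.ssl", "8443"))) {
    previewProtocol = "https";                                                 // request arrived on the SSL port
}
// the explicit scheme header set by YaCyDefaultServlet wins when default ports hide the scheme
previewProtocol = header.get(HeaderFramework.X_YACY_REQUEST_SCHEME, previewProtocol);
prop.put("myPreviewAddress", previewAddress);
prop.put("myPreviewProtocol", previewProtocol);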

@ -38,7 +38,7 @@ public class Translator_p {
final servletProperties prop = new servletProperties();
final Switchboard sb = (Switchboard) env;
String langcfg = env.getConfig("locale.language", "default");
String langcfg = sb.getConfig("locale.language", "default");
prop.put("targetlang", langcfg);
if ("default".equals(langcfg)) {
prop.put("errmsg", "activate a different language");
@ -47,10 +47,10 @@ public class Translator_p {
prop.put("errmsg", "");
}
File lngfile = new File("locales", langcfg + ".lng");
File lngfile = new File(sb.getAppPath("locale.source", "locales"), langcfg + ".lng");
CreateTranslationMasters ctm = new CreateTranslationMasters(/*new File ("locales","master.lng.xlf")*/);
File masterxlf = new File("locales", "master.lng.xlf");
File masterxlf = new File(sb.getAppPath("locale.source", "locales"), "master.lng.xlf");
if (!masterxlf.exists()) ctm.createMasterTranslationLists(masterxlf);
Map<String, Map<String, String>> origTrans = ctm.joinMasterTranslationLists(masterxlf, lngfile);
final File locallngfile = ctm.getScratchFile(lngfile);

@ -31,7 +31,7 @@
<tr><th>Item</th><th>URL</th><th>Success</th><th>Message</th></tr>
<tr>
<td>#[item]#</td>
<td><a href="#[url]#">#[url]#</a></td>
<td>#(success)#::<a href="#[url]#">#[url]#</a>#(/success)#</td>
<td>#(success)#fail::ok#(/success)#</td>
<td>#(success)##[message]#::<a href="#[message]#" target="_blank">#[message]#</a>#(/success)#</td>
</tr>
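For readers unfamiliar with the YaCy template syntax in this row: #[key]# substitutes a value, and #(key)#alternative0::alternative1#(/key)# is a numeric switch selected by the integer the servlet puts under that key. A minimal hedged sketch, with the key name taken from the table above and the values only illustrative:

// template:  #(success)#fail::ok#(/success)#
prop.put("success", 1);      // 1 selects the right alternative: "ok", plus the linked URL and message cells
// prop.put("success", 0);   // 0 would select the left alternative: "fail"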

@ -22,7 +22,6 @@ import java.io.File;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.StandardCopyOption;
import net.yacy.yacy;
@ -30,7 +29,6 @@ import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.order.Base64Order;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.search.Switchboard;
import net.yacy.search.index.Fulltext;
import net.yacy.server.serverObjects;
import net.yacy.server.serverSwitch;
@ -63,12 +61,19 @@ public class share {
// push mode: this does a document upload
prop.put("mode", 1);
prop.put("success", 0);
prop.put("mode_success", 0);
if (post == null) return prop;
// check file name
String filename = post.get("data", "");
if (!filename.startsWith(Fulltext.yacy_dump_prefix) || !filename.endsWith(".xml.gz")) return prop;
if (filename.isEmpty()) {
prop.put("mode_success_message", "file name is empty");
return prop;
}
if (!filename.startsWith(Fulltext.yacy_dump_prefix) || !filename.endsWith(".xml.gz")) {
prop.put("mode_success_message", "no index dump file (" + Fulltext.yacy_dump_prefix + "*.xml.gz)");
return prop;
}
// check data
String dataString = post.get("data$file", "");
@ -99,7 +104,7 @@ public class share {
return prop;
}
prop.put("success", 1);
prop.put("mode_success", 1);
return prop;
}
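The renamed keys follow the template convention (hedged: inferred from the surrounding servlet code) that properties referenced inside a #(mode)# block are looked up with a mode_ prefix, which is why the bare success key had no effect in push mode. A minimal sketch of the failure path:

// push mode: report the outcome through the prefixed keys read inside the #(mode)# block
prop.put("mode", 1);                 // select the push-mode branch of the template
prop.put("mode_success", 0);         // 0 = failure alternative of #(success)# inside that branch
prop.put("mode_success_message", "no index dump file (" + Fulltext.yacy_dump_prefix + "*.xml.gz)");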

@ -43,19 +43,19 @@ public class compare_yacy {
searchengines.put("YaCy (local)", "yacysearch.html?display=2&resource=local&query=");
//searchengines.put("google.com", "https://www.google.com/#q=");
searchengines.put("startpage.com", "https://startpage.com/do/search?cat=web&query=");
searchengines.put("bing.com", "http://www.bing.com/search?q=");
searchengines.put("metager.de", "http://www.metager.de/meta/cgi-bin/meta.ger1?eingabe=");
searchengines.put("metager2.de (web)", "http://www.metager2.de/search.php?ses=web&q=");
searchengines.put("metager2.de (international)", "http://www.metager2.de/search.php?ses=international&q=");
searchengines.put("yahoo.com", "http://search.yahoo.com/search?p=");
//searchengines.put("romso.de", "http://romso.de/?q="); // no serach service 2016-01-02
searchengines.put("Wikipedia English", "http://en.wikipedia.org/wiki/");
searchengines.put("Wikipedia Deutsch", "http://de.wikipedia.org/wiki/");
searchengines.put("Sciencenet", "http://sciencenet.fzk.de:8080/yacysearch.html?verify=true&resource=global&nav=all&display=2&meanCount=5&query=");
searchengines.put("bing.com", "https://www.bing.com/search?q=");
searchengines.put("metager.de", "https://www.metager.de/meta/cgi-bin/meta.ger1?eingabe=");
searchengines.put("metager2.de (web)", "https://www.metager2.de/search.php?ses=web&q=");
searchengines.put("metager2.de (international)", "https://www.metager2.de/search.php?ses=international&q=");
//searchengines.put("yahoo.com", "https://search.yahoo.com/search?p="); // no search service in iframe 2016-08-17 : "Load denied by X-Frame-Options: does not permit cross-origin framing."
//searchengines.put("romso.de", "http://romso.de/?q="); // no search service 2016-01-02
searchengines.put("Wikipedia English", "https://en.wikipedia.org/wiki/");
searchengines.put("Wikipedia Deutsch", "https://de.wikipedia.org/wiki/");
//searchengines.put("Sciencenet", "http://sciencenet.fzk.de:8080/yacysearch.html?verify=true&resource=global&nav=all&display=2&meanCount=5&query="); // no search service 2016-08-17
//searchengines.put("dbpedia", "http://dbpedia.neofonie.de/browse/~:"); // no search service 2016-01-02
searchengines.put("wolfram alpha", "http://www.wolframalpha.com/input/?i=");
searchengines.put("OAIster@OCLC", "http://oaister.worldcat.org/search?q=");
searchengines.put("oai.yacy.net", "http://oai.yacy.net/yacysearch.html?verify=true&resource=local&nav=all&display=2&meanCount=5&query=");
searchengines.put("wolfram alpha", "https://www.wolframalpha.com/input/?i=");
searchengines.put("OAIster@OCLC", "https://oaister.worldcat.org/search?q=");
//searchengines.put("oai.yacy.net", "http://oai.yacy.net/yacysearch.html?verify=true&resource=local&nav=all&display=2&meanCount=5&query="); // no search service 2016-08-17
}
public static serverObjects respond(final RequestHeader header, final serverObjects post, final serverSwitch env) {

@ -25,6 +25,7 @@
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
import net.yacy.cora.protocol.Domains;
import net.yacy.cora.protocol.HeaderFramework;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
@ -39,7 +40,7 @@ public class opensearchdescription {
String promoteSearchPageGreeting = env.getConfig(SwitchboardConstants.GREETING, "");
if (env.getConfigBool(SwitchboardConstants.GREETING_NETWORK_NAME, false)) promoteSearchPageGreeting = env.getConfig("network.unit.description", "");
String thisaddress = header.get("Host"); // returns host:port (if not defalt http/https ports)
String thisaddress = header.get(HeaderFramework.HOST); // returns host:port (if not default http/https ports)
String thisprotocol = "http";
if (thisaddress == null) {
thisaddress = Domains.LOCALHOST + ":" + sb.getConfig("port", "8090");
@ -49,6 +50,9 @@ public class opensearchdescription {
thisprotocol = "https";
}
}
/* YaCyDefaultServlet should have filled this custom header, making sure we know here whether the original request is http or https
* (when default ports (80 and 443) are used, there is no way to distinguish the two schemes relying only on the Host header) */
thisprotocol = header.get(HeaderFramework.X_YACY_REQUEST_SCHEME, thisprotocol);
final serverObjects prop = new serverObjects();
prop.put("compareyacy", post != null && post.getBoolean("compare_yacy") ? 1 : 0);

@ -2,7 +2,7 @@
<OpenSearchDescription
xmlns="http://a9.com/-/spec/opensearch/1.1/"
xmlns:suggestions="http://www.opensearch.org/specifications/opensearch/extensions/suggestions/1.1">
<ShortName>#(compareyacy)#::Compare #(/compareyacy)#YaCy/#[clientname]#</ShortName>
<ShortName>#(compareyacy)#::Compare #(/compareyacy)#YaCy Search on '#[clientname]#'</ShortName>
<LongName>YaCy.net - #[SearchPageGreeting]#</LongName>
<Image type="image/gif">#[thisprotocol]#://#[thisaddress]#/env/grafics/yacy.png</Image>
<Image width="16" height="16">data:image/x-icon;base64,AAABAAEAEBAAAAAAGABoAwAAFgAAACgAAAAQAAAAIAAAAAEAGAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAD////////Chiu+fBm+fRq+fRq+fRq+fRq+fRq8ehXMmk////////////////////////////++exjUqGfZsnjYsXbYsXbYsXbZsnjTp2W+fBr///////////////////////////+9exfs2r/69/L58+z58+z58+z59e/z59bChSn///////////////////////////+9fBnnz638+fX69O369e748un38Ob59O3Ll0fVrGz////////////////////////Jk0HJlUjXuYrWt4jVtYPr28T58+r59OzPn1fPnlX////////////////////////Rolu8eRXCiTPEjTvCijbNnlj169359e7Zs3vLlkf////////////////////////UqmrAgSLt3MP27eH1693169327+T59Ozo0rG/gB////////////////////////////+/gCDv4Mn48+v38Of59Oz58+v48+vr2b2+fRv///////////////////////////+9ehXr17n58+z48ejo0bDp07T48+zx49DBhCj///////////////////////////+/fhzjyJ/59Oz59OzQoVvNmU759Oz58+vKlUbXrnH////////////////////////LlkfUqmn59Oz48+vZtHzNm1D48+v59OzPoFjOnlX////////////////////////OnVPOnFL59e348+vq1rnEiC7u38j69vDXsHTIkT3////////////////////////UqWjIkD327+T58+3s2bzAgSLp07T79/Pmzqq+fBn///////////////////////////+/fx7v4Mr7+PTx5NDHkD3kyqP8+vjs2r+9exf///////////////////////////++fBjQoFnVq2zTp2bGjTjJk0LWrW7SpWC+exf////////////////////////////Nm1C6dQy7eBG8eBK9exe8ehS7dxC6dQzLlkj////////////AH///wB8x+sAf///AD/+PwA///8AP///AD///4A9PU+APoz/gB///4AdxvOAHoz/gB3368AfjU/AHT1PwB/+P
@ -17,7 +17,7 @@
<Url type="application/rss+xml" method="GET" template="#[thisprotocol]#://#[thisaddress]#/yacysearch.rss?nav=&amp;query={searchTerms}&amp;startRecord={startIndex?}&amp;maximumRecords={count?}&amp;nav=all&amp;resource=global" />
<Url type="application/atom+xml" method="GET" template="#[thisprotocol]#://#[thisaddress]#/yacysearch.atom?query={searchTerms}&amp;startRecord={startIndex?}&amp;maximumRecords={count?}&amp;resource=global" />
::
<Url type="text/html" method="GET" template="http://#[thisaddress]#/compare_yacy.html?query={searchTerms}&amp;left=#[search_left]#&amp;right=#[search_right]#&amp;display=2" />
<Url type="text/html" method="GET" template="#[thisprotocol]#://#[thisaddress]#/compare_yacy.html?query={searchTerms}&amp;left=#[search_left]#&amp;right=#[search_right]#&amp;display=2" />
#(/compareyacy)#
<Url type="application/x-suggestions+json" template="#[thisprotocol]#://#[thisaddress]#/suggest.json?query={searchTerms}"/>
<Url type="application/x-suggestions+xml" template="#[thisprotocol]#://#[thisaddress]#/suggest.xml?query={searchTerms}"/>

@ -36,7 +36,7 @@
<!-- the Solr version used in dependency section for all related dependencies -->
<solr.version>5.5.2</solr.version>
<!-- the Jetty version used in dependency section for all related dependencies -->
<jetty.version>9.2.17.v20160517</jetty.version>
<jetty.version>9.2.18.v20160721</jetty.version>
<!-- properties used for filtering yacyBuildProperties.java -->
<REPL_DATE>${DSTAMP}</REPL_DATE>

@ -113,6 +113,8 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
public static final String X_YACY_ORIGINAL_REQUEST_LINE = "X-Original-Request-Line";
public static final String X_YACY_MEDIA_TITLE = "X-YaCy-Media-Title"; // can be attached to media files which do not have metadata; this will be used as title
public static final String X_YACY_MEDIA_KEYWORDS = "X-YaCy-Media-Keywords"; // can be attached to media files which do not have metadata; this will be used as keywords (space-separated list of words)
/** Added when generating legacy request header to allow template servlets to know the original request scheme : "http" or "https" */
public static final String X_YACY_REQUEST_SCHEME = "X-YaCy-Request-Scheme";
public static final String SET_COOKIE = "Set-Cookie";
public static final String SET_COOKIE2 = "Set-Cookie2";
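Both ends of this new header appear in later hunks; a hedged end-to-end sketch of the intended round trip:

// producer (YaCyDefaultServlet): forward the scheme of the original servlet request
legacyRequestHeader.put(HeaderFramework.X_YACY_REQUEST_SCHEME, request.getScheme());   // "http" or "https"

// consumer (template servlets such as opensearchdescription or ConfigPortal):
// detect https from the SSL port first, then let the explicit header override the guess
String protocol = "http";
String host = header.get(HeaderFramework.HOST);
if (host != null && host.endsWith(":" + sb.getConfig("port.ssl", "8443"))) {
    protocol = "https";
}
protocol = header.get(HeaderFramework.X_YACY_REQUEST_SCHEME, protocol);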

@ -39,7 +39,8 @@ public abstract class AbstractParser implements Parser {
protected final Set<String> SUPPORTED_MIME_TYPES = new LinkedHashSet<String>();
protected final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
private final String name;
protected Object scraperObject; // the scraper or source object used by the parser, if any; otherwise null
/**
* initialize a parser with a name
* @param name

@ -92,13 +92,13 @@ public class Document {
private final Set<String> languages;
private boolean indexingDenied;
private final double lon, lat;
private final Object parserObject; // the source object that was used to create the Document
private final Parser parserObject; // the source object that was used to create the Document
private final Map<String, Set<String>> generic_facets; // a map from vocabulary names to the set of tags for that vocabulary which apply for this document
private final Date lastModified;
private int crawldepth;
public Document(final DigestURL location, final String mimeType, final String charset,
final Object parserObject,
final Parser parserObject,
final Set<String> languages,
final String[] keywords,
final List<String> titles,
@ -160,11 +160,29 @@ public class Document {
if (contentDomain != ContentDomain.ALL) return contentDomain;
return this.dc_source().getContentDomainFromExt();
}
public Object getParserObject() {
/**
* The parser used to generate the document
* @return Parser
*/
public Parser getParserObject() {
return this.parserObject;
}
/**
* Convenient call to get the source/scraper object of the underlying parser
* if the parser uses a scraper, like htmlParser
* @return scraper object typically of type ContentScraper but may also be of type DCEntry
*/
public Object getScraperObject() {
if (this.parserObject instanceof AbstractParser) {
if (((AbstractParser) this.parserObject).scraperObject != null) {
return ((AbstractParser) this.parserObject).scraperObject;
}
}
return null;
}
public Set<String> getContentLanguages() {
return this.languages;
}
@ -931,9 +949,9 @@ dc_rights
// clean up parser data
for (final Document doc: docs) {
Object parserObject = doc.getParserObject();
if (parserObject instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parserObject;
Object scraper = doc.getScraperObject();
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
html.close();
}
}
@ -979,9 +997,9 @@ dc_rights
if (!entry.getKey().attachedNofollow()) result.put(entry.getKey(), entry.getValue());
}
}
final Object parser = d.getParserObject();
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
final Object scraper = d.getScraperObject();
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
String refresh = html.getRefreshPath();
if (refresh != null && refresh.length() > 0) try {result.put(new AnchorURL(refresh), "refresh");} catch (final MalformedURLException e) {}
AnchorURL canonical = html.getCanonical();
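With parserObject now typed as Parser and the new accessor in place, callers stop casting the parser and ask for the scraper instead; the consumer pattern applied throughout this commit boils down to this hedged sketch:

// narrow the scraper, not the parser
final Object scraper = document.getScraperObject();
if (scraper instanceof ContentScraper) {
    final ContentScraper html = (ContentScraper) scraper;   // html documents: canonical, refresh, images, ...
    html.close();
} else if (scraper instanceof DCEntry) {
    // surrogate documents created via DCEntry.document() end up here
}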

@ -45,6 +45,8 @@ import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.Document;
import net.yacy.document.parser.genericParser;
import net.yacy.search.schema.CollectionSchema;
public class DCEntry extends MultiMapSolrParams {
@ -330,11 +332,15 @@ public class DCEntry extends MultiMapSolrParams {
languages.add(getLanguage());
List<String> t = new ArrayList<String>(1);
t.add(getTitle());
// for processing during indexing, embed the entry as the source scraperObject of a standard parser object
genericParser parserobj = new genericParser(this); // init the simplest parser with DCEntry as source/scraperObject used during indexing
return new Document(
getIdentifier(true),
"text/html",
StandardCharsets.UTF_8.name(),
this,
parserobj,
languages,
getSubject(), // might be null
t,
@ -343,7 +349,7 @@ public class DCEntry extends MultiMapSolrParams {
null,
getDescriptions(),
getLon(), getLat(),
get("text_t", ""),
get(CollectionSchema.text_t.name(), ""),
null,
null,
null,

@ -44,6 +44,17 @@ public class genericParser extends AbstractParser implements Parser {
// this parser is used if no other fits. This parser fits all
}
/**
* Constructor that allows setting a scraperObject,
* because the scraper/source object is intended to stay protected.
* This is used by surrogate parsers to set a source/scraper other than ContentScraper
* @param scraper
*/
public genericParser(Object scraper) {
super("Generic Parser");
this.scraperObject = scraper;
}
@Override
public Document[] parse(
final DigestURL location,
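Together with the DCEntry change above, this constructor lets a surrogate entry travel as the protected scraperObject of a real Parser; a hedged sketch of the wiring:

// in DCEntry.document(): wrap the entry in the simplest parser
genericParser parserobj = new genericParser(this);     // 'this' DCEntry becomes the scraperObject

// downstream (e.g. CollectionConfiguration): recover the entry without knowing the parser type
Object scraper = document.getScraperObject();
if (scraper instanceof DCEntry) {
    DCEntry dcentry = (DCEntry) scraper;                // md: fields of the surrogate are written to Solr
}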

@ -34,9 +34,7 @@ import java.nio.charset.Charset;
import java.nio.charset.IllegalCharsetNameException;
import java.nio.charset.StandardCharsets;
import java.nio.charset.UnsupportedCharsetException;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.Set;
import net.yacy.cora.document.encoding.UTF8;
import net.yacy.cora.document.id.DigestURL;
@ -60,21 +58,29 @@ public class htmlParser extends AbstractParser implements Parser {
private static final int maxLinks = 10000;
public final static String[] htmlExtensions = new String[]{
"htm","html","shtml","shtm","stm","xhtml","phtml","phtm",
"tpl","php","php2","php3","php4","php5","cfm","asp","aspx","tex","txt","msg"
};
public final static Set<String> htmlExtensionsSet;
static {
htmlExtensionsSet = new HashSet<>(htmlExtensions.length);
for (String ext: htmlExtensions) htmlExtensionsSet.add(ext);
}
public htmlParser() {
super("Streaming HTML Parser");
this.SUPPORTED_EXTENSIONS.addAll(htmlExtensionsSet);
this.SUPPORTED_EXTENSIONS.add("htm");
this.SUPPORTED_EXTENSIONS.add("html");
this.SUPPORTED_EXTENSIONS.add("shtml");
this.SUPPORTED_EXTENSIONS.add("shtm");
this.SUPPORTED_EXTENSIONS.add("stm");
this.SUPPORTED_EXTENSIONS.add("xhtml");
this.SUPPORTED_EXTENSIONS.add("phtml");
this.SUPPORTED_EXTENSIONS.add("phtm");
this.SUPPORTED_EXTENSIONS.add("tpl");
this.SUPPORTED_EXTENSIONS.add("php");
this.SUPPORTED_EXTENSIONS.add("php2");
this.SUPPORTED_EXTENSIONS.add("php3");
this.SUPPORTED_EXTENSIONS.add("php4");
this.SUPPORTED_EXTENSIONS.add("php5");
this.SUPPORTED_EXTENSIONS.add("cfm");
this.SUPPORTED_EXTENSIONS.add("asp");
this.SUPPORTED_EXTENSIONS.add("aspx");
this.SUPPORTED_EXTENSIONS.add("tex");
this.SUPPORTED_EXTENSIONS.add("txt");
this.SUPPORTED_EXTENSIONS.add("msg");
this.SUPPORTED_MIME_TYPES.add("text/html");
this.SUPPORTED_MIME_TYPES.add("text/xhtml+xml");
this.SUPPORTED_MIME_TYPES.add("application/xhtml+xml");
@ -97,7 +103,8 @@ public class htmlParser extends AbstractParser implements Parser {
try {
// first get a document from the parsed html
Charset[] detectedcharsetcontainer = new Charset[]{null};
final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
scraperObject = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
ContentScraper scraper = (ContentScraper) scraperObject; // shortcut to access ContentScraper methods
// parseToScraper also detects/corrects/sets charset from html content tag
final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
Document documentSnapshot = null;
@ -130,7 +137,7 @@ public class htmlParser extends AbstractParser implements Parser {
* @param scraper
* @return
*/
private static Document transformScraper(final DigestURL location, final String mimeType, final String charSet, final ContentScraper scraper) {
private Document transformScraper(final DigestURL location, final String mimeType, final String charSet, final ContentScraper scraper) {
final String[] sections = new String[
scraper.getHeadlines(1).length +
scraper.getHeadlines(2).length +
@ -150,7 +157,7 @@ public class htmlParser extends AbstractParser implements Parser {
location,
mimeType,
charSet,
scraper,
this,
scraper.getContentLanguages(),
scraper.getKeywords(),
scraper.getTitles(),
@ -178,7 +185,7 @@ public class htmlParser extends AbstractParser implements Parser {
} catch (UnsupportedEncodingException e) {
sourceStream = new ByteArrayInputStream(UTF8.getBytes(input));
}
ContentScraper scraper;
ContentScraper scraper; // for this static method there is no need to init the local this.scraperObject
try {
scraper = parseToScraper(location, documentCharset, vocabularyScraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
} catch (Failure e) {
@ -242,6 +249,7 @@ public class htmlParser extends AbstractParser implements Parser {
}
// parsing the content
// for this static method there is no need to init the local this.scraperObject here
final ContentScraper scraper = new ContentScraper(location, maxLinks, vocabularyScraper, timezoneOffset);
final TransformerWriter writer = new TransformerWriter(null,null,scraper,null,false, Math.max(64, Math.min(4096, sourceStream.available())));
try {

@ -68,10 +68,9 @@ public class swfParser extends AbstractParser implements Parser {
try {
final SWF2HTML swf2html = new SWF2HTML();
String contents = "";
ContentScraper htmlscraper=null;
try {
contents = swf2html.convertSWFToHTML(source);
htmlscraper = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100);
contents = swf2html.convertSWFToHTML(source);
scraperObject = htmlParser.parseToScraper(location, charset, scraper, timezoneOffset, contents, 100);
} catch (final NegativeArraySizeException e) {
throw new Parser.Failure(e.getMessage(), location);
} catch (final IOException e) {
@ -79,29 +78,9 @@ public class swfParser extends AbstractParser implements Parser {
} catch (final Exception e) {
throw new Parser.Failure(e.getMessage(), location);
}
/*
String url = null;
String urlnr = null;
final String linebreak = System.getProperty("line.separator");
final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
int urls = 0;
int urlStart = -1;
int urlEnd = 0;
int p0 = 0;
//extracting urls
while ((urlStart = contents.indexOf("http://",urlEnd)) >= 0){
urlEnd = contents.indexOf(linebreak,urlStart);
url = contents.substring(urlStart,urlEnd);
urlnr = Integer.toString(++urls);
AnchorURL u = new AnchorURL(url);
u.setNameProperty(urlnr);
anchors.add(u);
contents = contents.substring(0,urlStart)+contents.substring(urlEnd);
}
*/
// As the result of parsing this function must return a plasmaParserDocument object
// As the result of parsing this function must return a plasmaParserDocument object
ContentScraper htmlscraper = (ContentScraper) this.scraperObject; // shortcut to access ContentScraper methods
return new Document[]{new Document(
location, // url of the source document
mimeType, // the documents mime type

@ -27,23 +27,22 @@
package net.yacy.document.parser;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.Date;
import java.util.ArrayList;
import java.util.List;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.document.id.MultiProtocolURL;
import net.yacy.cora.util.CommonPattern;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;
import org.apache.poi.hssf.eventusermodel.HSSFEventFactory;
import org.apache.poi.hssf.eventusermodel.HSSFListener;
import org.apache.poi.hssf.eventusermodel.HSSFRequest;
import org.apache.poi.hssf.record.NumberRecord;
import org.apache.poi.hssf.record.Record;
import org.apache.poi.hssf.record.SSTRecord;
import org.apache.poi.hpsf.SummaryInformation;
import org.apache.poi.hssf.extractor.ExcelExtractor;
import org.apache.poi.poifs.filesystem.POIFSFileSystem;
@ -76,113 +75,52 @@ public class xlsParser extends AbstractParser implements Parser {
final int timezoneOffset,
final InputStream source) throws Parser.Failure,
InterruptedException {
return new XLSHSSFListener().parse(location, mimeType, charset, source);
}
public class XLSHSSFListener implements HSSFListener {
//StringBuilder for parsed text
private final StringBuilder sbFoundStrings;
public XLSHSSFListener() {
this.sbFoundStrings = new StringBuilder(100);
}
/*
* parses the source documents and returns a Document containing
* all extracted information about the parsed document
*/
public Document[] parse(final DigestURL location, final String mimeType,
@SuppressWarnings("unused") final String charset, final InputStream source) throws Parser.Failure,
InterruptedException {
try {
//create a new org.apache.poi.poifs.filesystem.Filesystem
final POIFSFileSystem poifs = new POIFSFileSystem(source);
//get the Workbook (excel part) stream in a InputStream
final InputStream din = poifs.createDocumentInputStream("Workbook");
//construct out HSSFRequest object
final HSSFRequest req = new HSSFRequest();
//lazy listen for ALL records with the listener shown above
req.addListenerForAllRecords(this);
//create our event factory
final HSSFEventFactory factory = new HSSFEventFactory();
//process our events based on the document input stream
factory.processEvents(req, din);
//close our document input stream (don't want to leak these!)
din.close();
//now the parsed strings are in the StringBuilder, now convert them to a String
final String contents = this.sbFoundStrings.toString().trim();
/*
* create the plasmaParserDocument for the database
* and set shortText and bodyText properly
*/
return new Document[]{new Document(
location,
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
null,
singleList(location.getFile()),
null, // TODO: AUTHOR
"", // TODO: publisher
null,
null,
0.0d, 0.0d,
contents,
null,
null,
null,
false,
new Date())};
} catch (final Exception e) {
if (e instanceof InterruptedException) throw (InterruptedException) e;
/*
* an unexpected error occurred, log it and throw a Parser.Failure
*/
ConcurrentLog.logException(e);
final String errorMsg = "Unable to parse the xls document '" + location + "':" + e.getMessage();
throw new Parser.Failure(errorMsg, location);
}
}
@Override
public void processRecord(final Record record) {
SSTRecord sstrec = null;
switch (record.getSid()){
case NumberRecord.sid: {
final NumberRecord numrec = (NumberRecord) record;
this.sbFoundStrings.append(numrec.getValue());
break;
}
//unique string records
case SSTRecord.sid: {
sstrec = (SSTRecord) record;
for (int k = 0; k < sstrec.getNumUniqueStrings(); k++){
this.sbFoundStrings.append( sstrec.getString(k) );
//add line seperator
this.sbFoundStrings.append( "\n" );
}
break;
}
/*
case LabelSSTRecord.sid: {
final LabelSSTRecord lsrec = (LabelSSTRecord)record;
sbFoundStrings.append( sstrec.getString(lsrec.getSSTIndex()) );
break;
}
*/
}
//add line seperator
this.sbFoundStrings.append( "\n" );
try {
//create a new org.apache.poi.poifs.filesystem.Filesystem
final POIFSFileSystem poifs = new POIFSFileSystem(source);
ExcelExtractor exceldoc = new ExcelExtractor(poifs);
exceldoc.setIncludeSheetNames(false); // exclude sheet names from getText(), as otherwise even empty sheet names are returned
SummaryInformation sumInfo = exceldoc.getSummaryInformation();
String title = sumInfo.getTitle();
if (title == null || title.isEmpty()) title = MultiProtocolURL.unescape(location.getFileName());
final String subject = sumInfo.getSubject();
List<String> descriptions = new ArrayList<String>();
if (subject != null && !subject.isEmpty()) descriptions.add(subject);
// get keywords (for yacy as array)
final String keywords = sumInfo.getKeywords();
final String[] keywlist;
if (keywords != null && !keywords.isEmpty()) {
keywlist = CommonPattern.COMMA.split(keywords);
} else keywlist = null;
Document[] retdocs = new Document[]{new Document(
location,
mimeType,
StandardCharsets.UTF_8.name(),
this,
null,
keywlist,
singleList(title),
sumInfo.getAuthor(),
exceldoc.getDocSummaryInformation().getCompany(),
null,
descriptions,
0.0d, 0.0d,
exceldoc.getText(),
null,
null,
null,
false,
sumInfo.getLastSaveDateTime())};
return retdocs;
} catch (IOException ex1) {
throw new Parser.Failure(ex1.getMessage(), location);
}
}
}
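The rewrite trades the low-level HSSFListener event walk for POI's ExcelExtractor, which also exposes the workbook summary metadata; stripped of the Document construction, the extraction core is roughly this hedged sketch, using only the POI calls shown above:

final POIFSFileSystem poifs = new POIFSFileSystem(source);   // OLE2 container of the .xls file
final ExcelExtractor exceldoc = new ExcelExtractor(poifs);
exceldoc.setIncludeSheetNames(false);                        // sheet names (even empty ones) would only add noise
final SummaryInformation sumInfo = exceldoc.getSummaryInformation();
String title = sumInfo.getTitle();                           // may be null or empty: fall back to the file name
final String text = exceldoc.getText();                      // all cell contents as plain text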

@ -686,6 +686,9 @@ public class YaCyDefaultServlet extends HttpServlet {
legacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_CLIENTIP, request.getRemoteAddr());
legacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_PATH, target);
legacyRequestHeader.put(HeaderFramework.CONNECTION_PROP_EXT, targetExt);
/* Add request scheme (http or https) to allow templates to know whether the original request is http or https
* (when default ports (80 and 443) are used, there is no way to distinguish the two schemes relying only on the Host header) */
legacyRequestHeader.put(HeaderFramework.X_YACY_REQUEST_SCHEME, request.getScheme());
Switchboard sb = Switchboard.getSwitchboard();
if (legacyRequestHeader.containsKey(RequestHeader.AUTHORIZATION)) {
if (HttpServletRequest.BASIC_AUTH.equalsIgnoreCase(request.getAuthType())) {

@ -601,11 +601,14 @@ public class Segment {
crawlProfile != null && document.getDepth() <= crawlProfile.snapshotMaxdepth() &&
!crawlProfile.snapshotsMustnotmatch().matcher(urlNormalform).matches()) {
// load pdf in case that is wanted. This can later be used to compute a web page preview in the search results
String ext = MultiProtocolURL.getFileExtension(url.getFile()).toLowerCase();
if (ext.length() == 0 || url.getFile().length() <= 1 || htmlParser.htmlExtensionsSet.contains(ext)) {
Parser p = document.getParserObject();
boolean mimesupported = false;
if (p instanceof htmlParser)
mimesupported = ((htmlParser)p).supportedMimeTypes().contains(document.dc_format());
if (mimesupported)
// STORE IMAGE AND METADATA
Transactions.store(vector, true, crawlProfile.snapshotLoadImage(), crawlProfile.snapshotReplaceold(), proxy, acceptLanguage);
}
}
// STORE TO SOLR

@ -335,29 +335,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.audiolinkscount_i)) add(doc, CollectionSchema.audiolinkscount_i, md.laudio());
if (allAttr || contains(CollectionSchema.videolinkscount_i)) add(doc, CollectionSchema.videolinkscount_i, md.lvideo());
if (allAttr || contains(CollectionSchema.applinkscount_i)) add(doc, CollectionSchema.applinkscount_i, md.lapp());
if (allAttr || contains(CollectionSchema.text_t)) {
// construct the text from other metadata parts.
// This is necessary here since that is used to search the link when no other data (parsed text body) is available
StringBuilder sb = new StringBuilder(120);
// accText(sb, md.dc_title()); // default search field via getQueryFields(), not needed for snippet (always displayed)
// accText(sb, md.dc_creator()); // author is in Default ranking/getQueryFields
// accText(sb, md.dc_publisher()); // has it's own metadata field publisher_t (not part of default queryfields) and mostly N/A
// accText(sb, md.snippet()); // above added to description_txt, default search field via getQueryFields(), description_txt incl. in snippet calculation
accText(sb, md.url().toTokens());
// accText(sb, keywords); // default search field via getQueryFields(), keywords not incl. in snippet calculation
add(doc, CollectionSchema.text_t, sb.toString());
}
return doc;
}
private static void accText(final StringBuilder sb, String text) {
if (text == null || text.length() == 0) return;
if (sb.length() != 0) sb.append(' ');
text = text.trim();
if (!text.isEmpty() && text.charAt(text.length() - 1) == '.') sb.append(text); else sb.append(text).append('.');
}
public static class Subgraph {
public final ArrayList<String>[] urlProtocols, urlStubs, urlAnchorTexts;
@SuppressWarnings("unchecked")
@ -541,11 +522,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
int c = 0;
final Object parser = document.getParserObject();
final Object scraper = document.getScraperObject();
boolean containsCanonical = false;
DigestURL canonical = null;
if (parser instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) parser;
if (scraper instanceof ContentScraper) {
final ContentScraper html = (ContentScraper) scraper;
List<ImageEntry> images = html.getImages();
// header tags
@ -885,9 +866,9 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
}
if (parser instanceof DCEntry) {
if (scraper instanceof DCEntry) {
// the document was created with a surrogate parsing; overwrite all md: -entries to Solr
DCEntry dcentry = (DCEntry) parser;
DCEntry dcentry = (DCEntry) scraper;
for (Map.Entry<String, String[]> entry: dcentry.getMap().entrySet()) {
String tag = entry.getKey();
if (!tag.startsWith("md:") || tag.length() < 4) continue;

@ -39,13 +39,22 @@ import java.nio.channels.FileChannel;
import java.nio.channels.FileLock;
import java.util.Properties;
import java.util.concurrent.Semaphore;
import com.google.common.io.Files;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.cora.protocol.ConnectionInfo;
import net.yacy.cora.protocol.RequestHeader;
import net.yacy.cora.protocol.TimeoutRequest;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.sorting.Array;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.Document;
import net.yacy.gui.YaCyApp;
import net.yacy.gui.framework.Browser;
import net.yacy.http.Jetty9HttpServerImpl;
@ -54,17 +63,11 @@ import net.yacy.kelondro.util.FileUtils;
import net.yacy.kelondro.util.Formatter;
import net.yacy.kelondro.util.MemoryControl;
import net.yacy.kelondro.util.OS;
import net.yacy.peers.Seed;
import net.yacy.peers.operation.yacyBuildProperties;
import net.yacy.peers.operation.yacyRelease;
import net.yacy.search.Switchboard;
import net.yacy.search.SwitchboardConstants;
import com.google.common.io.Files;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.order.Digest;
import net.yacy.cora.protocol.ConnectionInfo;
import net.yacy.crawler.retrieval.Response;
import net.yacy.peers.Seed;
import net.yacy.server.serverSwitch;
import net.yacy.utils.translation.TranslatorXliff;
@ -265,22 +268,7 @@ public final class yacy {
//final File htTemplatePath = new File(homePath, sb.getConfig("htTemplatePath","htdocs"));
// copy the donate iframe (better to copy this once here instead of doing this in an actual iframe in the search result)
final File wwwEnvPath = new File(htDocsPath, "env");
mkdirIfNeseccary(wwwEnvPath);
final String iframesource = sb.getConfig("donation.iframesource", "");
final String iframetarget = sb.getConfig("donation.iframetarget", "");
final File iframefile = new File(htDocsPath, iframetarget);
if (!iframefile.exists()) new Thread() {
@Override
public void run() {
final ClientIdentification.Agent agent = ClientIdentification.getAgent(ClientIdentification.yacyInternetCrawlerAgentName);
Response response;
try {
response = sb.loader == null ? null : sb.loader.load(sb.loader.request(new DigestURL(iframesource), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, agent);
if (response != null) FileUtils.copy(response.getContent(), iframefile);
} catch (Throwable e) {}
}
}.start();
importDonationIFrame(sb, htDocsPath);
// create default notifier picture
File notifierFile = new File(htDocsPath, "notifier.gif");
@ -421,6 +409,59 @@ public final class yacy {
} catch (final Exception e) {} // was once stopped by de.anomic.net.ftpc$sm.checkExit(ftpc.java:1790)
}
/**
* Concurrently import the donation iframe content to serve it directly from this peer.
* @param switchBoard the SwitchBoard instance. Must not be null.
* @param htDocsDirectory the custom htdocs directory. Must not be null.
*/
private static void importDonationIFrame(final Switchboard switchBoard, final File htDocsDirectory) {
final File wwwEnvPath = new File(htDocsDirectory, "env");
mkdirIfNeseccary(wwwEnvPath);
final String iframesource = switchBoard.getConfig("donation.iframesource", "");
final String iframetarget = switchBoard.getConfig("donation.iframetarget", "");
final File iframefile = new File(htDocsDirectory, iframetarget);
if (!iframefile.exists()) new Thread() {
@Override
public void run() {
final ClientIdentification.Agent agent = ClientIdentification.getAgent(ClientIdentification.yacyInternetCrawlerAgentName);
Response documentResponse;
try {
/* Load the donation html frame content */
documentResponse = switchBoard.loader == null ? null : switchBoard.loader.load(switchBoard.loader.request(new DigestURL(iframesource), false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, agent);
if (documentResponse != null) {
Document[] documents = documentResponse.parse();
if(documents != null && documents.length > 0 && documents[0] != null) {
Document donateDocument = documents[0];
String donateDocContent = new String(documentResponse.getContent(), donateDocument.getCharset());
/* Load image resources contained in the page */
if(donateDocument.getImages() != null) {
for(DigestURL imgURL : donateDocument.getImages().keySet()) {
try {
Response response = switchBoard.loader.load(switchBoard.loader.request(imgURL, false, true), CacheStrategy.NOCACHE, Integer.MAX_VALUE, null, agent);
if (response != null) {
String imgFileName = imgURL.getFileName();
/* Store each image in the same directory as the iframe target file */
FileUtils.copy(response.getContent(), new File(iframefile.getParentFile(), imgFileName));
/* Transform the original image URL to a relative one */
donateDocContent = donateDocContent.replace(imgURL.getURL().toString(), imgFileName);
}
} catch(IOException e) {
/* Failing to load one image should not stop the whole task */
ConcurrentLog.warn("STARTUP", "Donation frame retrieval : could not get an image resource.", e);
}
}
}
FileUtils.copy(donateDocContent.getBytes(donateDocument.getCharset()), iframefile);
}
}
} catch (Exception e) {
ConcurrentLog.warn("STARTUP", "Could not retrieve donation frame content.", e);
}
}
}.start();
}
/**
* @param f
*/

@ -0,0 +1,57 @@
package net.yacy.document.parser;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStreamReader;
import java.io.Reader;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.VocabularyScraper;
import static org.hamcrest.CoreMatchers.containsString;
import org.junit.Test;
import static org.junit.Assert.*;
public class xlsParserTest {
/**
* Test of parse method, of class xlsParser.
*/
@Test
public void testParse() throws Exception {
final String[][] testFiles = new String[][]{
// meaning: filename in test/parsertest, mimetype, title, creator, description,
new String[]{"umlaute_linux.xls", "application/msexcel", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
new String[]{"umlaute_mac.xls", "application/msexcel", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "", ""},
new String[]{"umlaute_windows.xls", "application/msexcel", "In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen", "afieg", ""}
};
for (final String[] testFile : testFiles) {
final String filename = "test/parsertest/" + testFile[0];
final File file = new File(filename);
final String mimetype = testFile[1];
final AnchorURL url = new AnchorURL("http://localhost/" + filename);
AbstractParser p = new xlsParser();
final Document[] docs = p.parse(url, mimetype, null, new VocabularyScraper(), 0, new FileInputStream(file));
for (final Document doc : docs) {
final Reader content = new InputStreamReader(doc.getTextStream(), doc.getCharset());
final StringBuilder str = new StringBuilder();
int c;
while ((c = content.read()) != -1) {
str.append((char) c);
}
System.out.println("Parsed " + filename + ": " + str);
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_creator(), containsString(testFile[3]));
if (testFile[4].length() > 0) {
assertThat(doc.dc_description()[0], containsString(testFile[4]));
}
}
}
}
}