From 16d1d744faa4c1bf82403c852c273e70b24179a8 Mon Sep 17 00:00:00 2001 From: Michael Peter Christen Date: Tue, 25 Jun 2013 16:27:20 +0200 Subject: [PATCH] added url_file_name_s to the default collection schema for the file name without the file extension. This part of the file path is removed from the multi-valued field url_paths_sxt, which therefore no longer contains the file name as the last element of the path list. The same applies to the new fields source_file_name_s and target_file_name_s in the webgraph schema. --- defaults/solr.collection.schema | 7 +++- defaults/solr.webgraph.schema | 10 ++++- htroot/ViewFile.java | 5 +-- htroot/yacysearchitem.java | 6 ++- .../yacy/cora/document/MultiProtocolURI.java | 26 ++++++------- .../document/analysis/Classification.java | 4 +- .../federate/solr/SchemaConfiguration.java | 18 ++++----- .../responsewriter/JsonResponseWriter.java | 9 ++--- source/net/yacy/crawler/data/Latency.java | 4 +- .../net/yacy/crawler/data/ResultImages.java | 2 +- .../yacy/crawler/retrieval/FileLoader.java | 3 +- .../net/yacy/crawler/retrieval/Response.java | 12 +++--- .../net/yacy/crawler/retrieval/SMBLoader.java | 2 +- .../net/yacy/data/ymark/YMarkAutoTagger.java | 3 +- source/net/yacy/document/Document.java | 4 +- source/net/yacy/document/LibraryProvider.java | 2 +- source/net/yacy/document/TextParser.java | 16 ++++---- .../yacy/document/parser/audioTagParser.java | 7 ++-- .../yacy/document/parser/genericParser.java | 4 +- .../document/parser/html/ContentScraper.java | 10 ++--- .../parser/images/genericImageParser.java | 14 +++---- .../net/yacy/document/parser/tarParser.java | 3 +- .../schema/CollectionConfiguration.java | 33 ++++++++++++---- .../yacy/search/schema/CollectionSchema.java | 3 +- .../search/schema/WebgraphConfiguration.java | 39 ++++++++----------- .../yacy/search/schema/WebgraphSchema.java | 6 ++- 26 files changed, 136 insertions(+), 116 deletions(-) diff --git a/defaults/solr.collection.schema b/defaults/solr.collection.schema index 4c10cc5b8..a37a5bfff 100644 
--- a/defaults/solr.collection.schema +++ b/defaults/solr.collection.schema @@ -334,12 +334,15 @@ underline_txt ## the protocol of the url url_protocol_s -## all path elements in the url -url_paths_sxt +## the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension +url_file_name_s ## the file name extension url_file_ext_s +## all path elements in the url hpath (see: http://www.ietf.org/rfc/rfc1738.txt) without the file name +url_paths_sxt + ## number of key-value pairs in search part of the url #url_parameter_i diff --git a/defaults/solr.webgraph.schema b/defaults/solr.webgraph.schema index dec0dcb02..d2c505d17 100644 --- a/defaults/solr.webgraph.schema +++ b/defaults/solr.webgraph.schema @@ -41,6 +41,9 @@ source_id_s ## the url without the protocol (source) #source_urlstub_s +## the file name without the extension (source) +#source_file_name_s + ## the file name extension (source) #source_file_ext_s @@ -53,7 +56,7 @@ source_id_s ## count of all path elements in the url (source) #source_path_folders_count_i -## all path elements in the url (source) +## all path elements in the url without the file name (source) #source_path_folders_sxt ## number of key-value pairs in search part of the url (source) @@ -132,6 +135,9 @@ target_protocol_s ## the url without the protocol (target) target_urlstub_s +## the file name without the extension (target) +target_file_name_s + ## the file name extension (target) target_file_ext_s @@ -144,7 +150,7 @@ target_file_ext_s ## count of all path elements in the url (target) #target_path_folders_count_i -## all path elements in the url (target) +## all path elements in the url without the file name (target) target_path_folders_sxt ## number of key-value pairs in search part of the url (target) diff --git a/htroot/ViewFile.java b/htroot/ViewFile.java index a67cd4921..223589113 100644 --- a/htroot/ViewFile.java +++ b/htroot/ViewFile.java @@ -185,7 +185,7 @@ public class ViewFile { 
} final String[] wordArray = wordArray(post.get("words", null)); - + final String ext = MultiProtocolURI.getFileExtension(url.getFileName()); if (viewMode.equals("plain")) { // TODO: how to handle very large files here ? @@ -209,7 +209,6 @@ public class ViewFile { } else if (viewMode.equals("iframeCache")) { prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CACHE); - final String ext = url.getFileExtension(); prop.put("viewMode_png", 0); prop.put("viewMode_html", 0); if (ext.length() > 0 && "jpg.jpeg.png.gif".indexOf(ext) >= 0) { @@ -389,7 +388,7 @@ public class ViewFile { prop.put("error_md5", urlEntry.md5()); prop.put("error_lat", urlEntry.lat()); prop.put("error_lon", urlEntry.lon()); - prop.put("error_doctype", Response.doctype2mime(url.getFileExtension(), urlEntry.doctype())); + prop.put("error_doctype", Response.doctype2mime(ext, urlEntry.doctype())); prop.put("error_language", urlEntry.language()); prop.put("error_flags", urlEntry.flags().toString()); prop.put("error_wordCount", urlEntry.wordCount()); diff --git a/htroot/yacysearchitem.java b/htroot/yacysearchitem.java index 80783193a..dc21ee141 100644 --- a/htroot/yacysearchitem.java +++ b/htroot/yacysearchitem.java @@ -29,6 +29,7 @@ import java.util.List; import net.yacy.cora.date.GenericFormatter; import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.RSSMessage; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.document.analysis.Classification.ContentDomain; @@ -189,6 +190,7 @@ public class yacysearchitem { // prop.putHTML("content_value", Interaction.TripleGet(result.urlstring(), "http://virtual.x/hasvalue", "anonymous")); // END interaction + String resultFileName = resultURL.getFileName(); prop.putHTML("content_target", target); if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay, 
ClientIdentification.DEFAULT_TIMEOUT); prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading @@ -210,7 +212,7 @@ public class yacysearchitem { prop.putHTML("content_sizename", RSSMessage.sizename(result.filesize())); prop.putHTML("content_showSize_sizename", RSSMessage.sizename(result.filesize())); prop.putHTML("content_host", resultURL.getHost() == null ? "" : resultURL.getHost()); - prop.putHTML("content_file", resultURL.getFileName()); + prop.putHTML("content_file", resultFileName); prop.putHTML("content_path", resultURL.getPath()); prop.put("content_nl", (item == theSearch.query.offset) ? 0 : 1); prop.putHTML("content_publisher", result.publisher()); @@ -243,7 +245,7 @@ public class yacysearchitem { prop.put("content_heuristic_name", heuristic.heuristicName); } EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theSearch.query.id(true), SearchEventType.FINALIZATION, "" + item, 0, 0), false); - final String ext = resultURL.getFileExtension().toLowerCase(); + final String ext = MultiProtocolURI.getFileExtension(resultFileName).toLowerCase(); if (ext.equals("png") || ext.equals("jpg") || ext.equals("gif")) { final String license = URLLicense.aquireLicense(resultURL); prop.put("content_code", license); diff --git a/source/net/yacy/cora/document/MultiProtocolURI.java b/source/net/yacy/cora/document/MultiProtocolURI.java index 394aaa87c..f7dcb5b3d 100644 --- a/source/net/yacy/cora/document/MultiProtocolURI.java +++ b/source/net/yacy/cora/document/MultiProtocolURI.java @@ -269,7 +269,7 @@ public class MultiProtocolURI implements Serializable, Comparable 0); } - public final boolean isCGI() { - final String ls = unescape(this.path.toLowerCase()); - return ls.indexOf(".cgi",0) >= 0 || - ls.indexOf(".exe",0) >= 0; + public static final boolean isCGI(final String extension) { + return "cgi.exe.jpg.jpeg".indexOf(extension.toLowerCase()) >= 0; } - public final boolean isImage() { - 
final String ext = getFileExtension().toLowerCase(); - return "png.gif.jpg.jpeg".indexOf(ext) >= 0; + public static final boolean isImage(final String extension) { + return "png.gif.jpg.jpeg".indexOf(extension.toLowerCase()) >= 0; } public final boolean isIndividual() { diff --git a/source/net/yacy/cora/document/analysis/Classification.java b/source/net/yacy/cora/document/analysis/Classification.java index 0d32da96b..61bfd5e31 100644 --- a/source/net/yacy/cora/document/analysis/Classification.java +++ b/source/net/yacy/cora/document/analysis/Classification.java @@ -201,10 +201,10 @@ public class Classification { } public static String url2mime(final MultiProtocolURI url, final String dfltMime) { - return url == null ? "application/octet-stream" : ext2mime(url.getFileExtension(), dfltMime); + return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()), dfltMime); } public static String url2mime(final MultiProtocolURI url) { - return url == null ? "application/octet-stream" : ext2mime(url.getFileExtension()); + return url == null ? 
"application/octet-stream" : ext2mime(MultiProtocolURI.getFileExtension(url.getFileName())); } } diff --git a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java index 27c4211d8..f6dccd598 100644 --- a/source/net/yacy/cora/federate/solr/SchemaConfiguration.java +++ b/source/net/yacy/cora/federate/solr/SchemaConfiguration.java @@ -146,43 +146,43 @@ public class SchemaConfiguration extends Configuration implements Serializable { } public void add(final SolrInputDocument doc, final SchemaDeclaration key, final String value) { - assert !key.isMultiValued(); + assert !key.isMultiValued() : "key = " + key.getSolrFieldName(); if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) key.add(doc, value); } public void add(final SolrInputDocument doc, final SchemaDeclaration key, final Date value) { - assert !key.isMultiValued(); + assert !key.isMultiValued() : "key = " + key.getSolrFieldName(); if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.getTime() > 0))) key.add(doc, value); } public void add(final SolrInputDocument doc, final SchemaDeclaration key, final String[] value) { - assert key.isMultiValued(); + assert key.isMultiValued() : "key = " + key.getSolrFieldName(); if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value); } public void add(final SolrInputDocument doc, final SchemaDeclaration key, final Integer[] value) { - assert key.isMultiValued(); + assert key.isMultiValued() : "key = " + key.getSolrFieldName(); if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value); } public void add(final SolrInputDocument doc, final SchemaDeclaration key, final List values) { - assert key.isMultiValued(); + assert key.isMultiValued() : "key = " + key.getSolrFieldName(); if ((isEmpty() || contains(key)) && (!this.lazy || (values != null && 
!values.isEmpty()))) key.add(doc, values); } public void add(final SolrInputDocument doc, final SchemaDeclaration key, final int value) { - assert !key.isMultiValued(); + assert !key.isMultiValued() : "key = " + key.getSolrFieldName(); if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value); } public void add(final SolrInputDocument doc, final SchemaDeclaration key, final long value) { - assert !key.isMultiValued(); + assert !key.isMultiValued() : "key = " + key.getSolrFieldName(); if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value); } public void add(final SolrInputDocument doc, final SchemaDeclaration key, final boolean value) { - assert !key.isMultiValued(); - if (isEmpty() || contains(key)) key.add(doc, value); + assert !key.isMultiValued() : "key = " + key.getSolrFieldName(); + if ((isEmpty() || contains(key)) && (!this.lazy || value)) key.add(doc, value); } public static Date getDate(SolrInputDocument doc, final SchemaDeclaration key) { diff --git a/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java b/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java index 1dfd38d1c..0aa684f27 100644 --- a/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java +++ b/source/net/yacy/cora/federate/solr/responsewriter/JsonResponseWriter.java @@ -148,14 +148,16 @@ public class JsonResponseWriter implements QueryResponseWriter { solitaireTag(writer, stag, value.stringValue()); continue; } - // some special handling here if (CollectionSchema.sku.getSolrFieldName().equals(fieldName)) { String u = value.stringValue(); try { url = new MultiProtocolURI(u); + String filename = url.getFileName(); solitaireTag(writer, "link", u); - solitaireTag(writer, "file", url.getFileName()); + solitaireTag(writer, "file", filename); + // get image license + if (MultiProtocolURI.isImage(filename)) URLLicense.aquireLicense(urlhash, url.toNormalform(true)); } catch 
(MalformedURLException e) {} continue; } @@ -206,9 +208,6 @@ public class JsonResponseWriter implements QueryResponseWriter { //missing: "code","faviconCode" } - // get image license - if (url.isImage()) URLLicense.aquireLicense(urlhash, url.toNormalform(true)); - // compute snippet from texts solitaireTag(writer, "path", path.toString()); solitaireTag(writer, "title", title.length() == 0 ? (texts.size() == 0 ? path.toString() : texts.get(0)) : title); diff --git a/source/net/yacy/crawler/data/Latency.java b/source/net/yacy/crawler/data/Latency.java index ea5b5ca12..4e5d86918 100644 --- a/source/net/yacy/crawler/data/Latency.java +++ b/source/net/yacy/crawler/data/Latency.java @@ -204,7 +204,7 @@ public class Latency { // for CGI accesses, we double the minimum time // mostly there is a database access in the background // which creates a lot of unwanted IO on target site - if (url.isCGI()) waiting = waiting * 2; + if (MultiProtocolURI.isCGI(url.getFileName())) waiting = waiting * 2; // if we have accessed the domain many times, get slower (the flux factor) if (!local) waiting += host.flux(waiting); @@ -238,7 +238,7 @@ public class Latency { // for CGI accesses, we double the minimum time // mostly there is a database access in the background // which creates a lot of unwanted IO on target site - if (url.isCGI()) { waiting = waiting * 2; s.append(", isCGI = true -> double"); } + if (MultiProtocolURI.isCGI(url.getFileName())) { waiting = waiting * 2; s.append(", isCGI = true -> double"); } // if we have accessed the domain many times, get slower (the flux factor) int flux = host.flux(waiting); diff --git a/source/net/yacy/crawler/data/ResultImages.java b/source/net/yacy/crawler/data/ResultImages.java index 62d76ce5d..1d9fdab5a 100644 --- a/source/net/yacy/crawler/data/ResultImages.java +++ b/source/net/yacy/crawler/data/ResultImages.java @@ -74,7 +74,7 @@ public class ResultImages { image.height() > 100 && image.width() < 1200 && image.height() < 1000 && - 
!"gif".equals(image.url().getFileExtension())) { + !"gif".equals(MultiProtocolURI.getFileExtension(image.url().getFileName()))) { // && ((urlString.lastIndexOf(".jpg") != -1)) || // ((urlString.lastIndexOf(".png") != -1)){ diff --git a/source/net/yacy/crawler/retrieval/FileLoader.java b/source/net/yacy/crawler/retrieval/FileLoader.java index 526762c28..049f6b031 100644 --- a/source/net/yacy/crawler/retrieval/FileLoader.java +++ b/source/net/yacy/crawler/retrieval/FileLoader.java @@ -31,6 +31,7 @@ import java.util.Date; import java.util.List; import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.cora.document.analysis.Classification; import net.yacy.cora.protocol.HeaderFramework; @@ -94,7 +95,7 @@ public class FileLoader { } // create response header - String mime = Classification.ext2mime(url.getFileExtension()); + String mime = Classification.ext2mime(MultiProtocolURI.getFileExtension(url.getFileName())); ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); diff --git a/source/net/yacy/crawler/retrieval/Response.java b/source/net/yacy/crawler/retrieval/Response.java index 5230bbe2b..d398388fc 100644 --- a/source/net/yacy/crawler/retrieval/Response.java +++ b/source/net/yacy/crawler/retrieval/Response.java @@ -70,7 +70,7 @@ public class Response { // doctype calculation public static char docType(final MultiProtocolURI url) { - String ext = url.getFileExtension(); + String ext = MultiProtocolURI.getFileExtension(url.getFileName()); if (ext == null) return DT_UNKNOWN; if (ext.equals(".gif")) return DT_IMAGE; if (ext.equals(".ico")) return DT_IMAGE; @@ -169,7 +169,7 @@ public class Response { // request and response headers may be zero in case that we process surrogates this.requestHeader = new RequestHeader(); 
this.responseHeader = new ResponseHeader(200); - this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(request.url().getFileExtension(), "text/plain")); // tell parser how to handle the content + this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(MultiProtocolURI.getFileExtension(request.url().getFileName()), "text/plain")); // tell parser how to handle the content if (!request.isEmpty()) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size())); this.profile = profile; this.status = QUEUE_STATE_FRESH; @@ -291,7 +291,7 @@ public class Response { return "dynamic_post"; } - if (url().isCGI()) { + if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) { return "dynamic_cgi"; } @@ -390,7 +390,7 @@ public class Response { if (url().isPOST()) { return false; } - if (url().isCGI()) { + if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) { return false; } @@ -541,7 +541,7 @@ public class Response { if (url().isPOST()) { return "Dynamic_(POST)"; } - if (url().isCGI()) { + if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) { return "Dynamic_(CGI)"; } } @@ -684,7 +684,7 @@ public class Response { // CGI access makes the page very individual, and therefore not usable in caches if (!profile().crawlingQ()) { if (url().isPOST()) { return "Dynamic_(POST)"; } - if (url().isCGI()) { return "Dynamic_(CGI)"; } + if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) { return "Dynamic_(CGI)"; } } // -authorization cases in request diff --git a/source/net/yacy/crawler/retrieval/SMBLoader.java b/source/net/yacy/crawler/retrieval/SMBLoader.java index 26c68fc04..3b120c240 100644 --- a/source/net/yacy/crawler/retrieval/SMBLoader.java +++ b/source/net/yacy/crawler/retrieval/SMBLoader.java @@ -113,7 +113,7 @@ public class SMBLoader { } // create response header - String mime = 
Classification.ext2mime(url.getFileExtension()); + String mime = Classification.ext2mime(MultiProtocolURI.getFileExtension(url.getFileName())); ResponseHeader responseHeader = new ResponseHeader(200); responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified()))); responseHeader.put(HeaderFramework.CONTENT_TYPE, mime); diff --git a/source/net/yacy/data/ymark/YMarkAutoTagger.java b/source/net/yacy/data/ymark/YMarkAutoTagger.java index 92ddcf083..f72266176 100644 --- a/source/net/yacy/data/ymark/YMarkAutoTagger.java +++ b/source/net/yacy/data/ymark/YMarkAutoTagger.java @@ -10,6 +10,7 @@ import java.util.TreeMap; import java.util.TreeSet; import java.util.concurrent.ArrayBlockingQueue; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.yacy.CacheStrategy; import net.yacy.cora.protocol.ClientIdentification; import net.yacy.crawler.retrieval.Response; @@ -161,7 +162,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle } final String clean = YMarkUtil.cleanTagsString(buffer.toString()); if(clean.equals(YMarkEntry.BOOKMARK.TAGS.deflt())) { - return document.getFileExtension(); + return MultiProtocolURI.getFileExtension(document.dc_source().getFileName()); } return clean; } finally { diff --git a/source/net/yacy/document/Document.java b/source/net/yacy/document/Document.java index d5e7bbda6..6d5e76fbe 100644 --- a/source/net/yacy/document/Document.java +++ b/source/net/yacy/document/Document.java @@ -153,8 +153,8 @@ public class Document { return this.languages; } - public String getFileExtension() { - return this.source.getFileExtension(); + public String getFileName() { + return this.source.getFileName(); } public Map> getGenericFacets() { diff --git a/source/net/yacy/document/LibraryProvider.java b/source/net/yacy/document/LibraryProvider.java index 8b31a6363..c7771a973 100644 --- a/source/net/yacy/document/LibraryProvider.java +++ 
b/source/net/yacy/document/LibraryProvider.java @@ -90,7 +90,7 @@ public class LibraryProvider { private Dictionary(final String nickname, final String url) { try { - this.filename = new MultiProtocolURI(url).getFileName(); + this.filename = (new MultiProtocolURI(url)).getFileName(); } catch ( final MalformedURLException e ) { assert false; } diff --git a/source/net/yacy/document/TextParser.java b/source/net/yacy/document/TextParser.java index 25f74a7d2..cb965d46e 100644 --- a/source/net/yacy/document/TextParser.java +++ b/source/net/yacy/document/TextParser.java @@ -194,7 +194,7 @@ public final class TextParser { try { idioms = parsers(location, mimeType); } catch (final Parser.Failure e) { - final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage(); + final String errorMsg = "Parser Failure for extension '" + MultiProtocolURI.getFileExtension(location.getFileName()) + "' or mimetype '" + mimeType + "': " + e.getMessage(); AbstractParser.log.logWarning(errorMsg); throw new Parser.Failure(errorMsg, location); } @@ -218,7 +218,7 @@ public final class TextParser { try { idioms = parsers(location, mimeType); } catch (final Parser.Failure e) { - final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage(); + final String errorMsg = "Parser Failure for extension '" + MultiProtocolURI.getFileExtension(location.getFileName()) + "' or mimetype '" + mimeType + "': " + e.getMessage(); AbstractParser.log.logWarning(errorMsg); throw new Parser.Failure(errorMsg, location); } @@ -252,7 +252,7 @@ public final class TextParser { final InputStream sourceStream ) throws Parser.Failure { if (AbstractParser.log.isFine()) AbstractParser.log.logFine("Parsing '" + location + "' from stream"); - final String fileExt = location.getFileExtension(); + final String fileExt = 
MultiProtocolURI.getFileExtension(location.getFileName()); final String documentCharset = htmlParser.patchCharsetEncoding(charset); assert parser != null; @@ -272,7 +272,7 @@ public final class TextParser { final String charset, final byte[] sourceArray ) throws Parser.Failure { - final String fileExt = location.getFileExtension(); + final String fileExt = MultiProtocolURI.getFileExtension(location.getFileName()); if (AbstractParser.log.isFine()) AbstractParser.log.logFine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]"); final String documentCharset = htmlParser.patchCharsetEncoding(charset); assert !parsers.isEmpty(); @@ -312,7 +312,7 @@ public final class TextParser { if (docs == null) { if (failedParser.isEmpty()) { - final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed."; + final String errorMsg = "Parsing content with file extension '" + fileExt + "' and mimetype '" + mimeType + "' failed."; //log.logWarning("Unable to parse '" + location + "'. 
" + errorMsg); throw new Parser.Failure(errorMsg, location); } @@ -362,7 +362,7 @@ public final class TextParser { final Set idioms = new HashSet(2); // check extension - String ext = url.getFileExtension(); + String ext = MultiProtocolURI.getFileExtension(url.getFileName()); Set idiom; if (ext != null && ext.length() > 0) { ext = ext.toLowerCase(); @@ -428,11 +428,11 @@ public final class TextParser { * @return an error if the extension is not supported, null otherwise */ public static String supportsExtension(final MultiProtocolURI url) { - return supportsExtension(url.getFileExtension().toLowerCase()); + return supportsExtension(MultiProtocolURI.getFileExtension(url.getFileName()).toLowerCase()); } public static String mimeOf(final MultiProtocolURI url) { - return mimeOf(url.getFileExtension()); + return mimeOf(MultiProtocolURI.getFileExtension(url.getFileName())); } public static String mimeOf(final String ext) { diff --git a/source/net/yacy/document/parser/audioTagParser.java b/source/net/yacy/document/parser/audioTagParser.java index dd3cc2d44..aef05c58d 100644 --- a/source/net/yacy/document/parser/audioTagParser.java +++ b/source/net/yacy/document/parser/audioTagParser.java @@ -72,8 +72,9 @@ public class audioTagParser extends AbstractParser implements Parser { final String charset, final InputStream source) throws Parser.Failure, InterruptedException { - final String filename = location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName()); - final String fileext = '.'+location.getFileExtension(); + String filename = location.getFileName(); + final String fileext = '.' + MultiProtocolURI.getFileExtension(filename); + filename = filename.isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(filename); String mime = mimeType; // fix mimeType @@ -190,7 +191,7 @@ public class audioTagParser extends AbstractParser implements Parser { this, null, null, - singleList(location.getFileName().isEmpty() ? 
location.toTokens() : MultiProtocolURI.unescape(location.getFileName())), // title + singleList(filename), // title "", // author location.getHost(), null, diff --git a/source/net/yacy/document/parser/genericParser.java b/source/net/yacy/document/parser/genericParser.java index 9ad666367..359746844 100644 --- a/source/net/yacy/document/parser/genericParser.java +++ b/source/net/yacy/document/parser/genericParser.java @@ -47,7 +47,7 @@ public class genericParser extends AbstractParser implements Parser { public Document[] parse(final DigestURI location, final String mimeType, final String charset, final InputStream source1) throws Parser.Failure, InterruptedException { - + String filename = location.getFileName(); final Document[] docs = new Document[]{new Document( location, mimeType, @@ -55,7 +55,7 @@ public class genericParser extends AbstractParser implements Parser { this, null, null, - singleList(location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName())), // title + singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(filename)), // title "", // author location.getHost(), null, diff --git a/source/net/yacy/document/parser/html/ContentScraper.java b/source/net/yacy/document/parser/html/ContentScraper.java index 854e5666d..1d10b350e 100644 --- a/source/net/yacy/document/parser/html/ContentScraper.java +++ b/source/net/yacy/document/parser/html/ContentScraper.java @@ -473,10 +473,8 @@ public class ContentScraper extends AbstractScraper implements Scraper { final String href = tagopts.getProperty("href", EMPTY_STRING); DigestURI url; if ((href.length() > 0) && ((url = absolutePath(href)) != null)) { - final String f = url.getFileName(); - final int p = f.lastIndexOf('.'); - final String type = (p < 0) ? 
EMPTY_STRING : f.substring(p + 1); - if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) { + final String ext = MultiProtocolURI.getFileExtension(url.getFileName()); + if (ext.equals("png") || ext.equals("gif") || ext.equals("jpg") || ext.equals("jpeg") || ext.equals("tiff") || ext.equals("tif")) { // special handling of such urls: put them to the image urls final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1); addImage(this.images, ie); @@ -656,7 +654,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { String ext; ArrayList f = new ArrayList(); for (final DigestURI url: this.anchors.keySet()) { - ext = url.getFileExtension(); + ext = MultiProtocolURI.getFileExtension(url.getFileName()); if (ext == null) continue; if (ext.equals("swf")) f.add(url); } @@ -666,7 +664,7 @@ public class ContentScraper extends AbstractScraper implements Scraper { public boolean containsFlash() { String ext; for (final MultiProtocolURI url: this.anchors.keySet()) { - ext = url.getFileExtension(); + ext = MultiProtocolURI.getFileExtension(url.getFileName()); if (ext == null) continue; if (ext.equals("swf")) return true; } diff --git a/source/net/yacy/document/parser/images/genericImageParser.java b/source/net/yacy/document/parser/images/genericImageParser.java index 8d306ebfe..7203f75ae 100644 --- a/source/net/yacy/document/parser/images/genericImageParser.java +++ b/source/net/yacy/document/parser/images/genericImageParser.java @@ -99,8 +99,9 @@ public class genericImageParser extends AbstractParser implements Parser { String author = null; String keywords = null; String description = null; - if (mimeType.equals("image/bmp") || - location.getFileExtension().equalsIgnoreCase("bmp")) { + String filename = location.getFileName(); + String ext = MultiProtocolURI.getFileExtension(filename); + if (mimeType.equals("image/bmp") || ext.equalsIgnoreCase("bmp")) { 
byte[] b; try { b = FileUtils.read(sourceStream); @@ -110,10 +111,7 @@ public class genericImageParser extends AbstractParser implements Parser { } final IMAGEMAP imap = bmpParser.parse(b); ii = parseJavaImage(location, imap.getImage()); - } else if (mimeType.equals("image/jpeg") || - location.getFileExtension().equalsIgnoreCase("jpg") || - location.getFileExtension().equalsIgnoreCase("jpeg") || - location.getFileExtension().equalsIgnoreCase("jpe")) { + } else if (mimeType.equals("image/jpeg") || ext.equalsIgnoreCase("jpg") || ext.equalsIgnoreCase("jpeg") || ext.equalsIgnoreCase("jpe")) { // use the exif parser from // http://www.drewnoakes.com/drewnoakes.com/code/exif/ // javadoc is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/javadoc/ @@ -190,7 +188,7 @@ public class genericImageParser extends AbstractParser implements Parser { final String infoString = ii.info.toString(); images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1)); - if (title == null || title.isEmpty()) title = MultiProtocolURI.unescape(location.getFileName()); + if (title == null || title.isEmpty()) title = MultiProtocolURI.unescape(filename); return new Document[]{new Document( location, @@ -297,7 +295,7 @@ public class genericImageParser extends AbstractParser implements Parser { DigestURI uri; try { uri = new DigestURI("http://localhost/" + image.getName()); - final Document[] document = parser.parse(uri, "image/" + uri.getFileExtension(), "UTF-8", new FileInputStream(image)); + final Document[] document = parser.parse(uri, "image/" + MultiProtocolURI.getFileExtension(uri.getFileName()), "UTF-8", new FileInputStream(image)); System.out.println(document[0].toString()); } catch (final MalformedURLException e) { e.printStackTrace(); diff --git a/source/net/yacy/document/parser/tarParser.java b/source/net/yacy/document/parser/tarParser.java index d2507cf15..8f03b6b85 100644 --- a/source/net/yacy/document/parser/tarParser.java +++ 
b/source/net/yacy/document/parser/tarParser.java @@ -33,6 +33,7 @@ import java.util.ArrayList; import java.util.List; import java.util.zip.GZIPInputStream; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.document.UTF8; import net.yacy.document.AbstractParser; import net.yacy.document.Document; @@ -64,7 +65,7 @@ public class tarParser extends AbstractParser implements Parser { final List docacc = new ArrayList(); Document[] subDocs = null; - final String ext = url.getFileExtension().toLowerCase(); + final String ext = MultiProtocolURI.getFileExtension(url.getFileName()).toLowerCase(); if (ext.equals("gz") || ext.equals("tgz")) { try { source = new GZIPInputStream(source); diff --git a/source/net/yacy/search/schema/CollectionConfiguration.java b/source/net/yacy/search/schema/CollectionConfiguration.java index c93142c5d..b75ae0459 100644 --- a/source/net/yacy/search/schema/CollectionConfiguration.java +++ b/source/net/yacy/search/schema/CollectionConfiguration.java @@ -35,10 +35,10 @@ import java.util.Date; import java.util.HashMap; import java.util.HashSet; import java.util.Iterator; -import java.util.LinkedHashMap; import java.util.LinkedHashSet; import java.util.List; import java.util.Map; +import java.util.Properties; import java.util.Set; import java.util.TreeMap; import java.util.concurrent.BlockingQueue; @@ -79,6 +79,7 @@ import net.yacy.kelondro.util.Bitfield; import net.yacy.search.index.Segment; import net.yacy.search.index.Segment.ReferenceReport; import net.yacy.search.index.Segment.ReferenceReportCache; +import net.yacy.search.schema.WebgraphConfiguration.Subgraph; import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; @@ -256,8 +257,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.description_words_val, cv); } + String filename = digestURI.getFileName(); + String extension = MultiProtocolURI.getFileExtension(filename); if (allAttr 
|| contains(CollectionSchema.author)) add(doc, CollectionSchema.author, md.dc_creator()); - if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype())); + if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, md.doctype())); if (allAttr || contains(CollectionSchema.last_modified)) add(doc, CollectionSchema.last_modified, md.moddate()); if (allAttr || contains(CollectionSchema.wordcount_i)) add(doc, CollectionSchema.wordcount_i, md.wordCount()); @@ -274,7 +277,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri // path elements of link if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths()); - if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension()); + if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename); + if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension); if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, md.limage()); if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, md.llocal()); @@ -474,8 +478,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards! 
// path elements of link + String filename = digestURI.getFileName(); + String extension = MultiProtocolURI.getFileExtension(filename); if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths()); - if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension()); + if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename); + if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension); // get list of all links; they will be shrinked by urls that appear in other fields of the solr schema Set inboundLinks = document.inboundLinks(); @@ -695,8 +702,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri outboundLinks.remove(canonical); add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false)); // set a flag if this is equal to sku - if (contains(CollectionSchema.canonical_equal_sku_b) && canonical.equals(docurl)) { - add(doc, CollectionSchema.canonical_equal_sku_b, true); + if (contains(CollectionSchema.canonical_equal_sku_b)) { + add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(docurl)); } } } @@ -784,9 +791,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount()); if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size()); if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount()); + Map alllinks = 
document.getAnchors(); + // create a subgraph + Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size()); + //if () { + webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations); + webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations); + //} + // list all links - WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, collections, clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks, citations); doc.webgraphDocuments.addAll(subgraph.edges); if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0])); if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, subgraph.urlStubs[0]); @@ -1164,8 +1178,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri if (contains(CollectionSchema.load_date_dt)) add(solrdoc, CollectionSchema.load_date_dt, new Date()); // path elements of link + String filename = digestURI.getFileName(); + String extension = MultiProtocolURI.getFileExtension(filename); if (contains(CollectionSchema.url_paths_sxt)) add(solrdoc, CollectionSchema.url_paths_sxt, digestURI.getPaths()); - if (contains(CollectionSchema.url_file_ext_s)) add(solrdoc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension()); + if (contains(CollectionSchema.url_file_name_s)) add(solrdoc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? 
filename.substring(0, filename.length() - extension.length() - 1) : filename); + if (contains(CollectionSchema.url_file_ext_s)) add(solrdoc, CollectionSchema.url_file_ext_s, extension); // fail reason and status if (contains(CollectionSchema.failreason_s)) add(solrdoc, CollectionSchema.failreason_s, failReason); diff --git a/source/net/yacy/search/schema/CollectionSchema.java b/source/net/yacy/search/schema/CollectionSchema.java index 1e49c8148..295171f6a 100644 --- a/source/net/yacy/search/schema/CollectionSchema.java +++ b/source/net/yacy/search/schema/CollectionSchema.java @@ -152,8 +152,9 @@ public enum CollectionSchema implements SchemaDeclaration { publisher_url_s(SolrType.string, true, true, false, false, false, "publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de"), url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"), - url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url"), + url_file_name_s(SolrType.string, true, true, false, false, false, "the file name (which is the string after the last '/' and before the query part from '?' 
on) without the file extension"), url_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension"), + url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url hpath (see: http://www.ietf.org/rfc/rfc1738.txt) without the file name"), url_parameter_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url"), url_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url"), url_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url"), diff --git a/source/net/yacy/search/schema/WebgraphConfiguration.java b/source/net/yacy/search/schema/WebgraphConfiguration.java index bd377175d..7773fb9a5 100644 --- a/source/net/yacy/search/schema/WebgraphConfiguration.java +++ b/source/net/yacy/search/schema/WebgraphConfiguration.java @@ -42,6 +42,7 @@ import org.apache.solr.common.SolrDocument; import org.apache.solr.common.SolrInputDocument; import net.yacy.cora.document.ASCII; +import net.yacy.cora.document.MultiProtocolURI; import net.yacy.cora.federate.solr.ProcessType; import net.yacy.cora.federate.solr.SchemaConfiguration; import net.yacy.cora.federate.solr.SchemaDeclaration; @@ -111,31 +112,13 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial } } - public Subgraph edges( - final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source, - final Map alllinks, - final Map images, - final Set inboundLinks, - final Set outboundLinks, - IndexCell citations - ) { - boolean allAttr = this.isEmpty(); - Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size()); - addEdges( - subgraph, source, responseHeader, collections, clickdepth_source, - allAttr, alllinks, images, true, inboundLinks, citations); - addEdges( - subgraph, source, 
responseHeader, collections, clickdepth_source, - allAttr, alllinks, images, false, outboundLinks, citations); - return subgraph; - } - - private void addEdges( + public void addEdges( final Subgraph subgraph, final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source, - final boolean allAttr, final Map alllinks, final Map images, + final Map alllinks, final Map images, final boolean inbound, final Set links, final IndexCell citations) { + boolean allAttr = this.isEmpty(); for (final DigestURI target_url: links) { Set processTypes = new LinkedHashSet(); @@ -194,7 +177,12 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc); if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom); } - if (allAttr || contains(WebgraphSchema.source_file_ext_s)) add(edge, WebgraphSchema.source_file_ext_s, source.getFileExtension()); + if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) { + String source_file_name = source.getFileName(); + String source_file_ext = MultiProtocolURI.getFileExtension(source_file_name); + add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? 
source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name); + add(edge, WebgraphSchema.source_file_ext_s, source_file_ext); + } if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath()); if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) { String[] paths = source.getPaths(); @@ -251,7 +239,12 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc); if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom); } - if (allAttr || contains(WebgraphSchema.target_file_ext_s)) add(edge, WebgraphSchema.target_file_ext_s, target_url.getFileExtension()); + if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) { + String target_file_name = target_url.getFileName(); + String target_file_ext = MultiProtocolURI.getFileExtension(target_file_name); + add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? 
target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name); + add(edge, WebgraphSchema.target_file_ext_s, target_file_ext); + } if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath()); if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) { String[] paths = target_url.getPaths(); diff --git a/source/net/yacy/search/schema/WebgraphSchema.java b/source/net/yacy/search/schema/WebgraphSchema.java index 15d257263..3086edf50 100644 --- a/source/net/yacy/search/schema/WebgraphSchema.java +++ b/source/net/yacy/search/schema/WebgraphSchema.java @@ -41,11 +41,12 @@ public enum WebgraphSchema implements SchemaDeclaration { source_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (source)"), source_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url (source)"), source_urlstub_s(SolrType.string, true, true, false, false, false, "the url without the protocol (source)"), + source_file_name_s(SolrType.string, true, true, false, false, false, "the file name without the extension (source)"), source_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension (source)"), source_chars_i(SolrType.num_integer, true, true, false, false, false, "number of all characters in the url (source)"), source_path_s(SolrType.string, true, true, false, false, false, "path of the url (source)"), source_path_folders_count_i(SolrType.num_integer, true, true, false, false, false, "count of all path elements in the url (source)"), - source_path_folders_sxt(SolrType.string, true, true, true, false, false, "all path elements in the url (source)"), + source_path_folders_sxt(SolrType.string, true, true, true, false, false, "all path elements in the url without the file name (source)"), source_parameter_count_i(SolrType.num_integer, 
true, true, false, false, false, "number of key-value pairs in search part of the url (source)"), source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"), source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"), @@ -73,11 +74,12 @@ public enum WebgraphSchema implements SchemaDeclaration { target_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (target)"), target_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url (target)"), target_urlstub_s(SolrType.string, true, true, false, false, false, "the url without the protocol (target)"), + target_file_name_s(SolrType.string, true, true, false, false, false, "the file name without the extension (target)"), target_file_ext_s(SolrType.string, true, true, false, false, true, "the file name extension (target)"), target_chars_i(SolrType.num_integer, true, true, false, false, false, "number of all characters in the url (target)"), target_path_s(SolrType.string, true, true, false, false, false, "path of the url (target)"), target_path_folders_count_i(SolrType.num_integer, true, true, false, false, false, "count of all path elements in the url (target)"), - target_path_folders_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url (target)"), + target_path_folders_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url without the file name (target)"), target_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (target)"), target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"), target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the 
values from key-value pairs in the search part of the url (target)"),