added url_file_name_s in default collection schema for the file name

without the file extension. This part of the file path is removed from
the multi-valued field url_paths_sxt, which no longer contains the file
name as the last element of the path list.

The same applies to the new fields source_file_name_s and
target_file_name_s in the webgraph schema.
pull/1/head
Michael Peter Christen 12 years ago
parent 8d1c4c423d
commit 16d1d744fa

@ -334,12 +334,15 @@ underline_txt
## the protocol of the url
url_protocol_s
## all path elements in the url
url_paths_sxt
## the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension
url_file_name_s
## the file name extension
url_file_ext_s
## all path elements in the url hpath (see: http://www.ietf.org/rfc/rfc1738.txt) without the file name
url_paths_sxt
## number of key-value pairs in search part of the url
#url_parameter_i

@ -41,6 +41,9 @@ source_id_s
## the url without the protocol (source)
#source_urlstub_s
## the file name without the extension (source)
#source_file_name_s
## the file name extension (source)
#source_file_ext_s
@ -53,7 +56,7 @@ source_id_s
## count of all path elements in the url (source)
#source_path_folders_count_i
## all path elements in the url (source)
## all path elements in the url without the file name (source)
#source_path_folders_sxt
## number of key-value pairs in search part of the url (source)
@ -132,6 +135,9 @@ target_protocol_s
## the url without the protocol (target)
target_urlstub_s
## the file name without the extension (target)
target_file_name_s
## the file name extension (target)
target_file_ext_s
@ -144,7 +150,7 @@ target_file_ext_s
## count of all path elements in the url (target)
#target_path_folders_count_i
## all path elements in the url (target)
## all path elements in the url without the file name (target)
target_path_folders_sxt
## number of key-value pairs in search part of the url (target)

@ -185,7 +185,7 @@ public class ViewFile {
}
final String[] wordArray = wordArray(post.get("words", null));
final String ext = MultiProtocolURI.getFileExtension(url.getFileName());
if (viewMode.equals("plain")) {
// TODO: how to handle very large files here ?
@ -209,7 +209,6 @@ public class ViewFile {
} else if (viewMode.equals("iframeCache")) {
prop.put("viewMode", VIEW_MODE_AS_IFRAME_FROM_CACHE);
final String ext = url.getFileExtension();
prop.put("viewMode_png", 0);
prop.put("viewMode_html", 0);
if (ext.length() > 0 && "jpg.jpeg.png.gif".indexOf(ext) >= 0) {
@ -389,7 +388,7 @@ public class ViewFile {
prop.put("error_md5", urlEntry.md5());
prop.put("error_lat", urlEntry.lat());
prop.put("error_lon", urlEntry.lon());
prop.put("error_doctype", Response.doctype2mime(url.getFileExtension(), urlEntry.doctype()));
prop.put("error_doctype", Response.doctype2mime(ext, urlEntry.doctype()));
prop.put("error_language", urlEntry.language());
prop.put("error_flags", urlEntry.flags().toString());
prop.put("error_wordCount", urlEntry.wordCount());

@ -29,6 +29,7 @@ import java.util.List;
import net.yacy.cora.date.GenericFormatter;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.RSSMessage;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.document.analysis.Classification.ContentDomain;
@ -189,6 +190,7 @@ public class yacysearchitem {
// prop.putHTML("content_value", Interaction.TripleGet(result.urlstring(), "http://virtual.x/hasvalue", "anonymous"));
// END interaction
String resultFileName = resultURL.getFileName();
prop.putHTML("content_target", target);
if (faviconURL != null && fileType == FileType.HTML) sb.loader.loadIfNotExistBackground(faviconURL, 1024 * 1024 * 10, null, TextSnippet.snippetMinLoadDelay, ClientIdentification.DEFAULT_TIMEOUT);
prop.putHTML("content_faviconCode", URLLicense.aquireLicense(faviconURL)); // acquire license for favicon url loading
@ -210,7 +212,7 @@ public class yacysearchitem {
prop.putHTML("content_sizename", RSSMessage.sizename(result.filesize()));
prop.putHTML("content_showSize_sizename", RSSMessage.sizename(result.filesize()));
prop.putHTML("content_host", resultURL.getHost() == null ? "" : resultURL.getHost());
prop.putHTML("content_file", resultURL.getFileName());
prop.putHTML("content_file", resultFileName);
prop.putHTML("content_path", resultURL.getPath());
prop.put("content_nl", (item == theSearch.query.offset) ? 0 : 1);
prop.putHTML("content_publisher", result.publisher());
@ -243,7 +245,7 @@ public class yacysearchitem {
prop.put("content_heuristic_name", heuristic.heuristicName);
}
EventTracker.update(EventTracker.EClass.SEARCH, new ProfilingGraph.EventSearch(theSearch.query.id(true), SearchEventType.FINALIZATION, "" + item, 0, 0), false);
final String ext = resultURL.getFileExtension().toLowerCase();
final String ext = MultiProtocolURI.getFileExtension(resultFileName).toLowerCase();
if (ext.equals("png") || ext.equals("jpg") || ext.equals("gif")) {
final String license = URLLicense.aquireLicense(resultURL);
prop.put("content_code", license);

@ -269,7 +269,7 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
public final ContentDomain getContentDomain() {
if (this.contentDomain == null) {
this.contentDomain = Classification.getContentDomain(this.getFileExtension());
this.contentDomain = Classification.getContentDomain(getFileExtension(this.getFileName()));
}
return this.contentDomain;
}
@ -711,14 +711,10 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return this.path.substring(p + 1); // the 'real' file name
}
public String getFileExtension() {
return getFileExtension(getFileName());
}
public static String getFileExtension(final String fileName) {
final int p = fileName.lastIndexOf('.');
if (p < 0) return "";
return fileName.substring(p + 1);
return fileName.substring(p + 1).toLowerCase();
}
public String getPath() {
@ -726,7 +722,12 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
}
public String[] getPaths() {
return this.path == null ? null : this.path.charAt(0) == '/' ? CommonPattern.SLASH.split(this.path.substring(1)) : CommonPattern.SLASH.split(this.path);
String s = this.path == null ? "" : this.path.charAt(0) == '/' ? this.path.substring(1) : this.path;
int p = s.lastIndexOf('/');
if (p < 0) return new String[0];
s = s.substring(0, p); // the paths do not contain the last part, which is considered as the getFileName() part.
String[] paths = CommonPattern.SLASH.split(s);
return paths;
}
/**
@ -973,15 +974,12 @@ public class MultiProtocolURI implements Serializable, Comparable<MultiProtocolU
return (this.searchpart != null) && (this.searchpart.length() > 0);
}
public final boolean isCGI() {
final String ls = unescape(this.path.toLowerCase());
return ls.indexOf(".cgi",0) >= 0 ||
ls.indexOf(".exe",0) >= 0;
/**
 * Tells whether a file extension denotes a CGI-style (dynamically generated) resource.
 * Only the extensions "cgi" and "exe" qualify; the comparison is case-insensitive.
 *
 * @param extension the file extension without the leading dot; a complete file name
 *                  is tolerated as well, in which case only the part after the last
 *                  '.' is examined (some callers pass url.getFileName() directly)
 * @return true if the extension is "cgi" or "exe"; false otherwise, including for
 *         null or empty input
 */
public static final boolean isCGI(final String extension) {
    if (extension == null) return false;
    // an extension never contains a dot, so anything before the last dot can only be a file name prefix
    final int dot = extension.lastIndexOf('.');
    final String ext = dot < 0 ? extension : extension.substring(dot + 1);
    // exact comparison: a substring lookup would accept "" (indexOf returns 0) and fragments like "e"
    return ext.equalsIgnoreCase("cgi") || ext.equalsIgnoreCase("exe");
}
public final boolean isImage() {
final String ext = getFileExtension().toLowerCase();
return "png.gif.jpg.jpeg".indexOf(ext) >= 0;
/**
 * Tells whether a file extension denotes one of the image formats png, gif, jpg or jpeg.
 * The comparison is case-insensitive.
 *
 * @param extension the file extension without the leading dot; a complete file name
 *                  is tolerated as well, in which case only the part after the last
 *                  '.' is examined (some callers pass url.getFileName() directly)
 * @return true if the extension is one of png, gif, jpg, jpeg; false otherwise,
 *         including for null or empty input
 */
public static final boolean isImage(final String extension) {
    if (extension == null) return false;
    // an extension never contains a dot, so anything before the last dot can only be a file name prefix
    final int dot = extension.lastIndexOf('.');
    final String ext = dot < 0 ? extension : extension.substring(dot + 1);
    // exact comparison: a substring lookup would accept "" (indexOf returns 0) and fragments like "pn"
    return ext.equalsIgnoreCase("png")
        || ext.equalsIgnoreCase("gif")
        || ext.equalsIgnoreCase("jpg")
        || ext.equalsIgnoreCase("jpeg");
}
public final boolean isIndividual() {

@ -201,10 +201,10 @@ public class Classification {
}
public static String url2mime(final MultiProtocolURI url, final String dfltMime) {
return url == null ? "application/octet-stream" : ext2mime(url.getFileExtension(), dfltMime);
return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()), dfltMime);
}
public static String url2mime(final MultiProtocolURI url) {
return url == null ? "application/octet-stream" : ext2mime(url.getFileExtension());
return url == null ? "application/octet-stream" : ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()));
}
}

@ -146,43 +146,43 @@ public class SchemaConfiguration extends Configuration implements Serializable {
}
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final String value) {
assert !key.isMultiValued();
assert !key.isMultiValued() : "key = " + key.getSolrFieldName();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && !value.isEmpty()))) key.add(doc, value);
}
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final Date value) {
assert !key.isMultiValued();
assert !key.isMultiValued() : "key = " + key.getSolrFieldName();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.getTime() > 0))) key.add(doc, value);
}
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final String[] value) {
assert key.isMultiValued();
assert key.isMultiValued() : "key = " + key.getSolrFieldName();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value);
}
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final Integer[] value) {
assert key.isMultiValued();
assert key.isMultiValued() : "key = " + key.getSolrFieldName();
if ((isEmpty() || contains(key)) && (!this.lazy || (value != null && value.length > 0))) key.add(doc, value);
}
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final List<?> values) {
assert key.isMultiValued();
assert key.isMultiValued() : "key = " + key.getSolrFieldName();
if ((isEmpty() || contains(key)) && (!this.lazy || (values != null && !values.isEmpty()))) key.add(doc, values);
}
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final int value) {
assert !key.isMultiValued();
assert !key.isMultiValued() : "key = " + key.getSolrFieldName();
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value);
}
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final long value) {
assert !key.isMultiValued();
assert !key.isMultiValued() : "key = " + key.getSolrFieldName();
if ((isEmpty() || contains(key)) && (!this.lazy || value != 0)) key.add(doc, value);
}
public void add(final SolrInputDocument doc, final SchemaDeclaration key, final boolean value) {
assert !key.isMultiValued();
if (isEmpty() || contains(key)) key.add(doc, value);
assert !key.isMultiValued() : "key = " + key.getSolrFieldName();
if ((isEmpty() || contains(key)) && (!this.lazy || value)) key.add(doc, value);
}
public static Date getDate(SolrInputDocument doc, final SchemaDeclaration key) {

@ -148,14 +148,16 @@ public class JsonResponseWriter implements QueryResponseWriter {
solitaireTag(writer, stag, value.stringValue());
continue;
}
// some special handling here
if (CollectionSchema.sku.getSolrFieldName().equals(fieldName)) {
String u = value.stringValue();
try {
url = new MultiProtocolURI(u);
String filename = url.getFileName();
solitaireTag(writer, "link", u);
solitaireTag(writer, "file", url.getFileName());
solitaireTag(writer, "file", filename);
// get image license
if (MultiProtocolURI.isImage(filename)) URLLicense.aquireLicense(urlhash, url.toNormalform(true));
} catch (MalformedURLException e) {}
continue;
}
@ -206,9 +208,6 @@ public class JsonResponseWriter implements QueryResponseWriter {
//missing: "code","faviconCode"
}
// get image license
if (url.isImage()) URLLicense.aquireLicense(urlhash, url.toNormalform(true));
// compute snippet from texts
solitaireTag(writer, "path", path.toString());
solitaireTag(writer, "title", title.length() == 0 ? (texts.size() == 0 ? path.toString() : texts.get(0)) : title);

@ -204,7 +204,7 @@ public class Latency {
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
if (url.isCGI()) waiting = waiting * 2;
if (MultiProtocolURI.isCGI(url.getFileName())) waiting = waiting * 2;
// if we have accessed the domain many times, get slower (the flux factor)
if (!local) waiting += host.flux(waiting);
@ -238,7 +238,7 @@ public class Latency {
// for CGI accesses, we double the minimum time
// mostly there is a database access in the background
// which creates a lot of unwanted IO on target site
if (url.isCGI()) { waiting = waiting * 2; s.append(", isCGI = true -> double"); }
if (MultiProtocolURI.isCGI(url.getFileName())) { waiting = waiting * 2; s.append(", isCGI = true -> double"); }
// if we have accessed the domain many times, get slower (the flux factor)
int flux = host.flux(waiting);

@ -74,7 +74,7 @@ public class ResultImages {
image.height() > 100 &&
image.width() < 1200 &&
image.height() < 1000 &&
!"gif".equals(image.url().getFileExtension())) {
!"gif".equals(MultiProtocolURI.getFileExtension(image.url().getFileName()))) {
// && ((urlString.lastIndexOf(".jpg") != -1)) ||
// ((urlString.lastIndexOf(".png") != -1)){

@ -31,6 +31,7 @@ import java.util.Date;
import java.util.List;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.cora.document.analysis.Classification;
import net.yacy.cora.protocol.HeaderFramework;
@ -94,7 +95,7 @@ public class FileLoader {
}
// create response header
String mime = Classification.ext2mime(url.getFileExtension());
String mime = Classification.ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()));
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);

@ -70,7 +70,7 @@ public class Response {
// doctype calculation
public static char docType(final MultiProtocolURI url) {
String ext = url.getFileExtension();
String ext = MultiProtocolURI.getFileExtension(url.getFileName());
if (ext == null) return DT_UNKNOWN;
if (ext.equals(".gif")) return DT_IMAGE;
if (ext.equals(".ico")) return DT_IMAGE;
@ -169,7 +169,7 @@ public class Response {
// request and response headers may be zero in case that we process surrogates
this.requestHeader = new RequestHeader();
this.responseHeader = new ResponseHeader(200);
this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(request.url().getFileExtension(), "text/plain")); // tell parser how to handle the content
this.responseHeader.put(HeaderFramework.CONTENT_TYPE, Classification.ext2mime(MultiProtocolURI.getFileExtension(request.url().getFileName()), "text/plain")); // tell parser how to handle the content
if (!request.isEmpty()) this.responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(request.size()));
this.profile = profile;
this.status = QUEUE_STATE_FRESH;
@ -291,7 +291,7 @@ public class Response {
return "dynamic_post";
}
if (url().isCGI()) {
if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) {
return "dynamic_cgi";
}
@ -390,7 +390,7 @@ public class Response {
if (url().isPOST()) {
return false;
}
if (url().isCGI()) {
if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) {
return false;
}
@ -541,7 +541,7 @@ public class Response {
if (url().isPOST()) {
return "Dynamic_(POST)";
}
if (url().isCGI()) {
if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) {
return "Dynamic_(CGI)";
}
}
@ -684,7 +684,7 @@ public class Response {
// CGI access makes the page very individual, and therefore not usable in caches
if (!profile().crawlingQ()) {
if (url().isPOST()) { return "Dynamic_(POST)"; }
if (url().isCGI()) { return "Dynamic_(CGI)"; }
if (MultiProtocolURI.isCGI(MultiProtocolURI.getFileExtension(url().getFileName()))) { return "Dynamic_(CGI)"; }
}
// -authorization cases in request

@ -113,7 +113,7 @@ public class SMBLoader {
}
// create response header
String mime = Classification.ext2mime(url.getFileExtension());
String mime = Classification.ext2mime(MultiProtocolURI.getFileExtension(url.getFileName()));
ResponseHeader responseHeader = new ResponseHeader(200);
responseHeader.put(HeaderFramework.LAST_MODIFIED, HeaderFramework.formatRFC1123(new Date(url.lastModified())));
responseHeader.put(HeaderFramework.CONTENT_TYPE, mime);

@ -10,6 +10,7 @@ import java.util.TreeMap;
import java.util.TreeSet;
import java.util.concurrent.ArrayBlockingQueue;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.yacy.CacheStrategy;
import net.yacy.cora.protocol.ClientIdentification;
import net.yacy.crawler.retrieval.Response;
@ -161,7 +162,7 @@ public class YMarkAutoTagger implements Runnable, Thread.UncaughtExceptionHandle
}
final String clean = YMarkUtil.cleanTagsString(buffer.toString());
if(clean.equals(YMarkEntry.BOOKMARK.TAGS.deflt())) {
return document.getFileExtension();
return MultiProtocolURI.getFileExtension(document.dc_source().getFileName());
}
return clean;
} finally {

@ -153,8 +153,8 @@ public class Document {
return this.languages;
}
public String getFileExtension() {
return this.source.getFileExtension();
public String getFileName() {
return this.source.getFileName();
}
public Map<String, Set<String>> getGenericFacets() {

@ -90,7 +90,7 @@ public class LibraryProvider {
private Dictionary(final String nickname, final String url) {
try {
this.filename = new MultiProtocolURI(url).getFileName();
this.filename = (new MultiProtocolURI(url)).getFileName();
} catch ( final MalformedURLException e ) {
assert false;
}

@ -194,7 +194,7 @@ public final class TextParser {
try {
idioms = parsers(location, mimeType);
} catch (final Parser.Failure e) {
final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage();
final String errorMsg = "Parser Failure for extension '" + MultiProtocolURI.getFileExtension(location.getFileName()) + "' or mimetype '" + mimeType + "': " + e.getMessage();
AbstractParser.log.logWarning(errorMsg);
throw new Parser.Failure(errorMsg, location);
}
@ -218,7 +218,7 @@ public final class TextParser {
try {
idioms = parsers(location, mimeType);
} catch (final Parser.Failure e) {
final String errorMsg = "Parser Failure for extension '" + location.getFileExtension() + "' or mimetype '" + mimeType + "': " + e.getMessage();
final String errorMsg = "Parser Failure for extension '" + MultiProtocolURI.getFileExtension(location.getFileName()) + "' or mimetype '" + mimeType + "': " + e.getMessage();
AbstractParser.log.logWarning(errorMsg);
throw new Parser.Failure(errorMsg, location);
}
@ -252,7 +252,7 @@ public final class TextParser {
final InputStream sourceStream
) throws Parser.Failure {
if (AbstractParser.log.isFine()) AbstractParser.log.logFine("Parsing '" + location + "' from stream");
final String fileExt = location.getFileExtension();
final String fileExt = MultiProtocolURI.getFileExtension(location.getFileName());
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
assert parser != null;
@ -272,7 +272,7 @@ public final class TextParser {
final String charset,
final byte[] sourceArray
) throws Parser.Failure {
final String fileExt = location.getFileExtension();
final String fileExt = MultiProtocolURI.getFileExtension(location.getFileName());
if (AbstractParser.log.isFine()) AbstractParser.log.logFine("Parsing " + location + " with mimeType '" + mimeType + "' and file extension '" + fileExt + "' from byte[]");
final String documentCharset = htmlParser.patchCharsetEncoding(charset);
assert !parsers.isEmpty();
@ -312,7 +312,7 @@ public final class TextParser {
if (docs == null) {
if (failedParser.isEmpty()) {
final String errorMsg = "Parsing content with file extension '" + location.getFileExtension() + "' and mimetype '" + mimeType + "' failed.";
final String errorMsg = "Parsing content with file extension '" + fileExt + "' and mimetype '" + mimeType + "' failed.";
//log.logWarning("Unable to parse '" + location + "'. " + errorMsg);
throw new Parser.Failure(errorMsg, location);
}
@ -362,7 +362,7 @@ public final class TextParser {
final Set<Parser> idioms = new HashSet<Parser>(2);
// check extension
String ext = url.getFileExtension();
String ext = MultiProtocolURI.getFileExtension(url.getFileName());
Set<Parser> idiom;
if (ext != null && ext.length() > 0) {
ext = ext.toLowerCase();
@ -428,11 +428,11 @@ public final class TextParser {
* @return an error if the extension is not supported, null otherwise
*/
public static String supportsExtension(final MultiProtocolURI url) {
return supportsExtension(url.getFileExtension().toLowerCase());
return supportsExtension(MultiProtocolURI.getFileExtension(url.getFileName()).toLowerCase());
}
public static String mimeOf(final MultiProtocolURI url) {
return mimeOf(url.getFileExtension());
return mimeOf(MultiProtocolURI.getFileExtension(url.getFileName()));
}
public static String mimeOf(final String ext) {

@ -72,8 +72,9 @@ public class audioTagParser extends AbstractParser implements Parser {
final String charset, final InputStream source)
throws Parser.Failure, InterruptedException {
final String filename = location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName());
final String fileext = '.'+location.getFileExtension();
String filename = location.getFileName();
final String fileext = '.' + MultiProtocolURI.getFileExtension(filename);
filename = filename.isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(filename);
String mime = mimeType;
// fix mimeType
@ -190,7 +191,7 @@ public class audioTagParser extends AbstractParser implements Parser {
this,
null,
null,
singleList(location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName())), // title
singleList(filename), // title
"", // author
location.getHost(),
null,

@ -47,7 +47,7 @@ public class genericParser extends AbstractParser implements Parser {
public Document[] parse(final DigestURI location, final String mimeType,
final String charset, final InputStream source1)
throws Parser.Failure, InterruptedException {
String filename = location.getFileName();
final Document[] docs = new Document[]{new Document(
location,
mimeType,
@ -55,7 +55,7 @@ public class genericParser extends AbstractParser implements Parser {
this,
null,
null,
singleList(location.getFileName().isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(location.getFileName())), // title
singleList(filename.isEmpty() ? location.toTokens() : MultiProtocolURI.unescape(filename)), // title
"", // author
location.getHost(),
null,

@ -473,10 +473,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String href = tagopts.getProperty("href", EMPTY_STRING);
DigestURI url;
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
final String f = url.getFileName();
final int p = f.lastIndexOf('.');
final String type = (p < 0) ? EMPTY_STRING : f.substring(p + 1);
if (type.equals("png") || type.equals("gif") || type.equals("jpg") || type.equals("jpeg") || type.equals("tiff") || type.equals("tif")) {
final String ext = MultiProtocolURI.getFileExtension(url.getFileName());
if (ext.equals("png") || ext.equals("gif") || ext.equals("jpg") || ext.equals("jpeg") || ext.equals("tiff") || ext.equals("tif")) {
// special handling of such urls: put them to the image urls
final ImageEntry ie = new ImageEntry(url, recursiveParse(text), -1, -1, -1);
addImage(this.images, ie);
@ -656,7 +654,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
String ext;
ArrayList<DigestURI> f = new ArrayList<DigestURI>();
for (final DigestURI url: this.anchors.keySet()) {
ext = url.getFileExtension();
ext = MultiProtocolURI.getFileExtension(url.getFileName());
if (ext == null) continue;
if (ext.equals("swf")) f.add(url);
}
@ -666,7 +664,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
public boolean containsFlash() {
String ext;
for (final MultiProtocolURI url: this.anchors.keySet()) {
ext = url.getFileExtension();
ext = MultiProtocolURI.getFileExtension(url.getFileName());
if (ext == null) continue;
if (ext.equals("swf")) return true;
}

@ -99,8 +99,9 @@ public class genericImageParser extends AbstractParser implements Parser {
String author = null;
String keywords = null;
String description = null;
if (mimeType.equals("image/bmp") ||
location.getFileExtension().equalsIgnoreCase("bmp")) {
String filename = location.getFileName();
String ext = MultiProtocolURI.getFileExtension(filename);
if (mimeType.equals("image/bmp") || ext.equalsIgnoreCase("bmp")) {
byte[] b;
try {
b = FileUtils.read(sourceStream);
@ -110,10 +111,7 @@ public class genericImageParser extends AbstractParser implements Parser {
}
final IMAGEMAP imap = bmpParser.parse(b);
ii = parseJavaImage(location, imap.getImage());
} else if (mimeType.equals("image/jpeg") ||
location.getFileExtension().equalsIgnoreCase("jpg") ||
location.getFileExtension().equalsIgnoreCase("jpeg") ||
location.getFileExtension().equalsIgnoreCase("jpe")) {
} else if (mimeType.equals("image/jpeg") || ext.equalsIgnoreCase("jpg") || ext.equalsIgnoreCase("jpeg") || ext.equalsIgnoreCase("jpe")) {
// use the exif parser from
// http://www.drewnoakes.com/drewnoakes.com/code/exif/
// javadoc is at: http://www.drewnoakes.com/drewnoakes.com/code/exif/javadoc/
@ -190,7 +188,7 @@ public class genericImageParser extends AbstractParser implements Parser {
final String infoString = ii.info.toString();
images.put(ii.location, new ImageEntry(location, "", ii.width, ii.height, -1));
if (title == null || title.isEmpty()) title = MultiProtocolURI.unescape(location.getFileName());
if (title == null || title.isEmpty()) title = MultiProtocolURI.unescape(filename);
return new Document[]{new Document(
location,
@ -297,7 +295,7 @@ public class genericImageParser extends AbstractParser implements Parser {
DigestURI uri;
try {
uri = new DigestURI("http://localhost/" + image.getName());
final Document[] document = parser.parse(uri, "image/" + uri.getFileExtension(), "UTF-8", new FileInputStream(image));
final Document[] document = parser.parse(uri, "image/" + MultiProtocolURI.getFileExtension(uri.getFileName()), "UTF-8", new FileInputStream(image));
System.out.println(document[0].toString());
} catch (final MalformedURLException e) {
e.printStackTrace();

@ -33,6 +33,7 @@ import java.util.ArrayList;
import java.util.List;
import java.util.zip.GZIPInputStream;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.document.UTF8;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
@ -64,7 +65,7 @@ public class tarParser extends AbstractParser implements Parser {
final List<Document> docacc = new ArrayList<Document>();
Document[] subDocs = null;
final String ext = url.getFileExtension().toLowerCase();
final String ext = MultiProtocolURI.getFileExtension(url.getFileName()).toLowerCase();
if (ext.equals("gz") || ext.equals("tgz")) {
try {
source = new GZIPInputStream(source);

@ -35,10 +35,10 @@ import java.util.Date;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.LinkedHashSet;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.concurrent.BlockingQueue;
@ -79,6 +79,7 @@ import net.yacy.kelondro.util.Bitfield;
import net.yacy.search.index.Segment;
import net.yacy.search.index.Segment.ReferenceReport;
import net.yacy.search.index.Segment.ReferenceReportCache;
import net.yacy.search.schema.WebgraphConfiguration.Subgraph;
import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
@ -256,8 +257,10 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.description_words_val, cv);
}
String filename = digestURI.getFileName();
String extension = MultiProtocolURI.getFileExtension(filename);
if (allAttr || contains(CollectionSchema.author)) add(doc, CollectionSchema.author, md.dc_creator());
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(digestURI.getFileExtension(), md.doctype()));
if (allAttr || contains(CollectionSchema.content_type)) add(doc, CollectionSchema.content_type, Response.doctype2mime(extension, md.doctype()));
if (allAttr || contains(CollectionSchema.last_modified)) add(doc, CollectionSchema.last_modified, md.moddate());
if (allAttr || contains(CollectionSchema.wordcount_i)) add(doc, CollectionSchema.wordcount_i, md.wordCount());
@ -274,7 +277,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
// path elements of link
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension());
if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
if (allAttr || contains(CollectionSchema.imagescount_i)) add(doc, CollectionSchema.imagescount_i, md.limage());
if (allAttr || contains(CollectionSchema.inboundlinkscount_i)) add(doc, CollectionSchema.inboundlinkscount_i, md.llocal());
@ -474,8 +478,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
add(doc, CollectionSchema.fuzzy_signature_unique_b, true); // this must be corrected afterwards!
// path elements of link
String filename = digestURI.getFileName();
String extension = MultiProtocolURI.getFileExtension(filename);
if (allAttr || contains(CollectionSchema.url_paths_sxt)) add(doc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension());
if (allAttr || contains(CollectionSchema.url_file_name_s)) add(doc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
if (allAttr || contains(CollectionSchema.url_file_ext_s)) add(doc, CollectionSchema.url_file_ext_s, extension);
// get list of all links; they will be shrinked by urls that appear in other fields of the solr schema
Set<DigestURI> inboundLinks = document.inboundLinks();
@ -695,8 +702,8 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
outboundLinks.remove(canonical);
add(doc, CollectionSchema.canonical_s, canonical.toNormalform(false));
// set a flag if this is equal to sku
if (contains(CollectionSchema.canonical_equal_sku_b) && canonical.equals(docurl)) {
add(doc, CollectionSchema.canonical_equal_sku_b, true);
if (contains(CollectionSchema.canonical_equal_sku_b)) {
add(doc, CollectionSchema.canonical_equal_sku_b, canonical.equals(docurl));
}
}
}
@ -784,9 +791,16 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (allAttr || contains(CollectionSchema.inboundlinksnofollowcount_i)) add(doc, CollectionSchema.inboundlinksnofollowcount_i, document.inboundLinkNofollowCount());
if (allAttr || contains(CollectionSchema.outboundlinkscount_i)) add(doc, CollectionSchema.outboundlinkscount_i, outboundLinks.size());
if (allAttr || contains(CollectionSchema.outboundlinksnofollowcount_i)) add(doc, CollectionSchema.outboundlinksnofollowcount_i, document.outboundLinkNofollowCount());
Map<DigestURI, Properties> alllinks = document.getAnchors();
// create a subgraph
Subgraph subgraph = new Subgraph(inboundLinks.size(), outboundLinks.size());
//if () {
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, true, inboundLinks, citations);
webgraph.addEdges(subgraph, digestURI, responseHeader, collections, clickdepth, alllinks, images, false, outboundLinks, citations);
//}
// list all links
WebgraphConfiguration.Subgraph subgraph = webgraph.edges(digestURI, responseHeader, collections, clickdepth, document.getAnchors(), images, inboundLinks, outboundLinks, citations);
doc.webgraphDocuments.addAll(subgraph.edges);
if (allAttr || contains(CollectionSchema.inboundlinks_protocol_sxt)) add(doc, CollectionSchema.inboundlinks_protocol_sxt, protocolList2indexedList(subgraph.urlProtocols[0]));
if (allAttr || contains(CollectionSchema.inboundlinks_urlstub_txt)) add(doc, CollectionSchema.inboundlinks_urlstub_txt, subgraph.urlStubs[0]);
@ -1164,8 +1178,11 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (contains(CollectionSchema.load_date_dt)) add(solrdoc, CollectionSchema.load_date_dt, new Date());
// path elements of link
String filename = digestURI.getFileName();
String extension = MultiProtocolURI.getFileExtension(filename);
if (contains(CollectionSchema.url_paths_sxt)) add(solrdoc, CollectionSchema.url_paths_sxt, digestURI.getPaths());
if (contains(CollectionSchema.url_file_ext_s)) add(solrdoc, CollectionSchema.url_file_ext_s, digestURI.getFileExtension());
if (contains(CollectionSchema.url_file_name_s)) add(solrdoc, CollectionSchema.url_file_name_s, filename.toLowerCase().endsWith("." + extension) ? filename.substring(0, filename.length() - extension.length() - 1) : filename);
if (contains(CollectionSchema.url_file_ext_s)) add(solrdoc, CollectionSchema.url_file_ext_s, extension);
// fail reason and status
if (contains(CollectionSchema.failreason_s)) add(solrdoc, CollectionSchema.failreason_s, failReason);

@ -152,8 +152,9 @@ public enum CollectionSchema implements SchemaDeclaration {
publisher_url_s(SolrType.string, true, true, false, false, false, "publisher url as defined in http://support.google.com/plus/answer/1713826?hl=de"),
url_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url"),
url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url"),
url_file_name_s(SolrType.string, true, true, false, false, false, "the file name (which is the string after the last '/' and before the query part from '?' on) without the file extension"),
url_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension"),
url_paths_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url hpath (see: http://www.ietf.org/rfc/rfc1738.txt) without the file name"),
url_parameter_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url"),
url_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url"),
url_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url"),

@ -42,6 +42,7 @@ import org.apache.solr.common.SolrDocument;
import org.apache.solr.common.SolrInputDocument;
import net.yacy.cora.document.ASCII;
import net.yacy.cora.document.MultiProtocolURI;
import net.yacy.cora.federate.solr.ProcessType;
import net.yacy.cora.federate.solr.SchemaConfiguration;
import net.yacy.cora.federate.solr.SchemaDeclaration;
@ -111,31 +112,13 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
}
}
/**
 * Collects the webgraph edges of one source document into a single {@link Subgraph}.
 * <p>
 * The subgraph is constructed from the sizes of {@code inboundLinks} and
 * {@code outboundLinks}. {@code addEdges} is then invoked twice on it with the
 * same source data: first with the inbound flag set and {@code inboundLinks},
 * then with the flag cleared and {@code outboundLinks}.
 * <p>
 * All parameters other than the two link sets are forwarded unchanged to
 * {@code addEdges}.
 *
 * @param inboundLinks  link set handed to the first {@code addEdges} call (inbound = true)
 * @param outboundLinks link set handed to the second {@code addEdges} call (inbound = false)
 * @return the subgraph that was passed to both {@code addEdges} calls
 */
public Subgraph edges(
        final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source,
        final Map<DigestURI, Properties> alllinks,
        final Map<DigestURI, ImageEntry> images,
        final Set<DigestURI> inboundLinks,
        final Set<DigestURI> outboundLinks,
        IndexCell<CitationReference> citations
        ) {
    // the emptiness of this configuration is handed to addEdges as its allAttr switch
    final boolean everyAttribute = this.isEmpty();
    final Subgraph collected = new Subgraph(inboundLinks.size(), outboundLinks.size());
    // inbound pass first, outbound pass second - same order as two explicit calls
    for (final boolean inbound : new boolean[] {true, false}) {
        addEdges(
                collected, source, responseHeader, collections, clickdepth_source,
                everyAttribute, alllinks, images, inbound, inbound ? inboundLinks : outboundLinks, citations);
    }
    return collected;
}
private void addEdges(
public void addEdges(
final Subgraph subgraph,
final DigestURI source, final ResponseHeader responseHeader, String[] collections, int clickdepth_source,
final boolean allAttr, final Map<DigestURI, Properties> alllinks, final Map<DigestURI, ImageEntry> images,
final Map<DigestURI, Properties> alllinks, final Map<DigestURI, ImageEntry> images,
final boolean inbound, final Set<DigestURI> links,
final IndexCell<CitationReference> citations) {
boolean allAttr = this.isEmpty();
for (final DigestURI target_url: links) {
Set<ProcessType> processTypes = new LinkedHashSet<ProcessType>();
@ -194,7 +177,12 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
if (allAttr || contains(WebgraphSchema.source_host_organizationdnc_s)) add(edge, WebgraphSchema.source_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.source_host_subdomain_s)) add(edge, WebgraphSchema.source_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.source_file_ext_s)) add(edge, WebgraphSchema.source_file_ext_s, source.getFileExtension());
if (allAttr || contains(WebgraphSchema.source_file_ext_s) || contains(WebgraphSchema.source_file_name_s)) {
String source_file_name = source.getFileName();
String source_file_ext = MultiProtocolURI.getFileExtension(source_file_name);
add(edge, WebgraphSchema.source_file_name_s, source_file_name.toLowerCase().endsWith("." + source_file_ext) ? source_file_name.substring(0, source_file_name.length() - source_file_ext.length() - 1) : source_file_name);
add(edge, WebgraphSchema.source_file_ext_s, source_file_ext);
}
if (allAttr || contains(WebgraphSchema.source_path_s)) add(edge, WebgraphSchema.source_path_s, source.getPath());
if (allAttr || contains(WebgraphSchema.source_path_folders_count_i) || contains(WebgraphSchema.source_path_folders_sxt)) {
String[] paths = source.getPaths();
@ -251,7 +239,12 @@ public class WebgraphConfiguration extends SchemaConfiguration implements Serial
if (allAttr || contains(WebgraphSchema.target_host_organizationdnc_s)) add(edge, WebgraphSchema.target_host_organizationdnc_s, orga + '.' + dnc);
if (allAttr || contains(WebgraphSchema.target_host_subdomain_s)) add(edge, WebgraphSchema.target_host_subdomain_s, subdom);
}
if (allAttr || contains(WebgraphSchema.target_file_ext_s)) add(edge, WebgraphSchema.target_file_ext_s, target_url.getFileExtension());
if (allAttr || contains(WebgraphSchema.target_file_ext_s) || contains(WebgraphSchema.target_file_name_s)) {
String target_file_name = target_url.getFileName();
String target_file_ext = MultiProtocolURI.getFileExtension(target_file_name);
add(edge, WebgraphSchema.target_file_name_s, target_file_name.toLowerCase().endsWith("." + target_file_ext) ? target_file_name.substring(0, target_file_name.length() - target_file_ext.length() - 1) : target_file_name);
add(edge, WebgraphSchema.target_file_ext_s, target_file_ext);
}
if (allAttr || contains(WebgraphSchema.target_path_s)) add(edge, WebgraphSchema.target_path_s, target_url.getPath());
if (allAttr || contains(WebgraphSchema.target_path_folders_count_i) || contains(WebgraphSchema.target_path_folders_sxt)) {
String[] paths = target_url.getPaths();

@ -41,11 +41,12 @@ public enum WebgraphSchema implements SchemaDeclaration {
source_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (source)"),
source_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url (source)"),
source_urlstub_s(SolrType.string, true, true, false, false, false, "the url without the protocol (source)"),
source_file_name_s(SolrType.string, true, true, false, false, false, "the file name without the extension (source)"),
source_file_ext_s(SolrType.string, true, true, false, false, false, "the file name extension (source)"),
source_chars_i(SolrType.num_integer, true, true, false, false, false, "number of all characters in the url (source)"),
source_path_s(SolrType.string, true, true, false, false, false, "path of the url (source)"),
source_path_folders_count_i(SolrType.num_integer, true, true, false, false, false, "count of all path elements in the url (source)"),
source_path_folders_sxt(SolrType.string, true, true, true, false, false, "all path elements in the url (source)"),
source_path_folders_sxt(SolrType.string, true, true, true, false, false, "all path elements in the url without the file name (source)"),
source_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (source)"),
source_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (source)"),
source_parameter_value_sxt(SolrType.string, true, true, true, false, false, "the values from key-value pairs in the search part of the url (source)"),
@ -73,11 +74,12 @@ public enum WebgraphSchema implements SchemaDeclaration {
target_id_s(SolrType.string, true, true, false, false, false, "primary key of document, the URL hash (target)"),
target_protocol_s(SolrType.string, true, true, false, false, false, "the protocol of the url (target)"),
target_urlstub_s(SolrType.string, true, true, false, false, false, "the url without the protocol (target)"),
target_file_name_s(SolrType.string, true, true, false, false, false, "the file name without the extension (target)"),
target_file_ext_s(SolrType.string, true, true, false, false, true, "the file name extension (target)"),
target_chars_i(SolrType.num_integer, true, true, false, false, false, "number of all characters in the url (target)"),
target_path_s(SolrType.string, true, true, false, false, false, "path of the url (target)"),
target_path_folders_count_i(SolrType.num_integer, true, true, false, false, false, "count of all path elements in the url (target)"),
target_path_folders_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url (target)"),
target_path_folders_sxt(SolrType.string, true, true, true, false, true, "all path elements in the url without the file name (target)"),
target_parameter_count_i(SolrType.num_integer, true, true, false, false, false, "number of key-value pairs in search part of the url (target)"),
target_parameter_key_sxt(SolrType.string, true, true, true, false, false, "the keys from key-value pairs in the search part of the url (target)"),
target_parameter_value_sxt(SolrType.string, true, true, true, false, true, "the values from key-value pairs in the search part of the url (target)"),

Loading…
Cancel
Save