Merge branch 'master' of git@gitorious.org:yacy/rc1.git

pull/1/head
orbiter 11 years ago
commit fec673c9d1

@ -26,19 +26,28 @@
<dd>#[count]#</dd>
<dt>Data</dt>
<dd><input name="data-#[count]#" type="file"></dd>
<dd>data-#[count]#=<input name="data-#[count]#" type="file"></dd>
<dt>URL</dt>
<dd><input name="url-#[count]#" type="text" value="http://nowhere.cc/example.txt" size="80" maxlength="512"></dd>
<dd>url-#[count]#=<input name="url-#[count]#" type="text" value="http://nowhere.cc/example.txt" size="80" maxlength="512"></dd>
<dt>Collection</dt>
<dd>collection-#[count]#=<input name="collection-#[count]#" type="text" value="push" size="80" maxlength="512"></dd>
<dt>Last-Modified</dt><!-- see: http://tools.ietf.org/html/rfc2616#section-14.29 -->
<dd><input name="lastModified-#[count]#" type="text" value="Tue, 15 Nov 1994 12:45:26 GMT" size="30" maxlength="40"></dd>
<!--<dd><input name="lastModified-#[count]#" type="text" value="Tue, 15 Nov 1994 12:45:26 GMT" size="30" maxlength="40"></dd>-->
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="Last-Modified:Tue, 15 Nov 1994 12:45:26 GMT" size="80" maxlength="80"></dd>
<dt>Content-Type</dt><!-- see: http://www.iana.org/assignments/media-types/media-types.xhtml -->
<dd><input name="contentType-#[count]#" type="text" value="text/plain" size="30" maxlength="80"></dd>
<!--<dd><input name="contentType-#[count]#" type="text" value="text/plain" size="30" maxlength="80"></dd>-->
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="Content-Type:text/plain" size="80" maxlength="80"></dd>
<dt></dt><dd>The following attributes are only used for media type content</dd>
<dt>Media-Title</dt>
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="X-YaCy-Media-Title:Hello Pictureworld" size="80" maxlength="200"></dd>
<dt>Collection</dt>
<dd><input name="collection-#[count]#" type="text" value="push" size="30" maxlength="512"></dd>
<dt>Media-Keywords ()</dt>
<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="X-YaCy-Media-Keywords:uno dos tres cuatro cinco" size="80" maxlength="200"></dd>
</dl>
</dd>
#{/input}#

@ -83,6 +83,15 @@ public class push_p {
responseHeader.put(HeaderFramework.LAST_MODIFIED, lastModified);
responseHeader.put(HeaderFramework.CONTENT_TYPE, contentType);
responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(data.length));
// add generic fields
String[] responseHeaderMap = post.getParams("responseHeader-" + i); // strings with key-value pairs; separated by ':'
for (String kv: responseHeaderMap) {
int p = kv.indexOf(':');
if (p < 0) continue;
String key = kv.substring(0, p).trim();
String value = kv.substring(p + 1).trim();
responseHeader.put(key, value);
}
CrawlProfile profile = sb.crawler.getPushCrawlProfile(collection);
// create requests and artificial response

@ -58,6 +58,7 @@ import net.yacy.cora.protocol.ftp.FTPClient;
import net.yacy.cora.protocol.http.HTTPClient;
import net.yacy.cora.util.CommonPattern;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.crawler.retrieval.Response;
import net.yacy.document.parser.html.CharacterCoding;
/**
@ -1040,7 +1041,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
}
/**
 * Tells whether a file extension denotes an image type.
 * The decision is delegated to Response.docTypeExt(String), so only exact
 * extension names match (no substring matching).
 *
 * @param extension the file extension to test; may be null or empty
 * @return true if the lower-cased extension maps to Response.DT_IMAGE;
 *         false otherwise, including for null or empty input
 */
public static final boolean isImage(final String extension) {
    if (extension == null || extension.length() == 0) return false;
    // Locale.ROOT keeps the lower-casing independent of the default locale
    // (e.g. "GIF" must become "gif" even under a Turkish default locale).
    return Response.docTypeExt(extension.toLowerCase(java.util.Locale.ROOT)) == Response.DT_IMAGE;
}
public final boolean isIndividual() {

@ -107,10 +107,12 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
public static final String X_ROBOTS_TAG = "X-Robots-Tag"; // see http://googleblog.blogspot.com/2007/07/robots-exclusion-protocol-now-with-even.html
public static final String X_ROBOTS = "X-Robots";
// declared exactly once; a second declaration of the same constant does not compile
public static final String X_YACY_INDEX_CONTROL = "X-YaCy-Index-Control";
//public static final String X_YACY_PREVIOUS_REQUEST_LINE = "X-Previous-Request-Line";
public static final String X_YACY_KEEP_ALIVE_REQUEST_COUNT = "X-Keep-Alive-Request-Count";
public static final String X_YACY_ORIGINAL_REQUEST_LINE = "X-Original-Request-Line";
public static final String X_YACY_MEDIA_TITLE = "X-YaCy-Media-Title"; // can be attached to media files which do not have metadata; this will be used as title
public static final String X_YACY_MEDIA_KEYWORDS = "X-YaCy-Media-Keywords"; // can be attached to media files which do not have metadata; this will be used as keywords (space-separated list of words)
public static final String SET_COOKIE = "Set-Cookie";
public static final String SET_COOKIE2 = "Set-Cookie2";

@ -279,7 +279,7 @@ public class TimeoutRequest<E> {
} }
}).call(timeout).longValue();
} catch (final ExecutionException e) {
throw new IOException(e.getMessage());
throw new IOException(file.toString() + ":" + e.getMessage());
}
}

@ -348,6 +348,8 @@ public class HostBalancer implements Balancer {
return request;
} catch (ConcurrentModificationException e) {
continue tryagain;
} catch (IOException e) {
throw e;
} catch (Throwable e) {
throw new IOException(e.getMessage());
}

@ -69,35 +69,64 @@ public class Response {
private int status; // tracker indexing status, see status defs below
private final boolean fromCache;
// doctype calculation
/**
 * Maps a file extension to a doctype character.
 * The extension is compared case-sensitively against a fixed table of
 * lower-case names without a leading dot.
 * TODO: this must be enhanced with a more generic way of configuration
 * @param ext the file extension; may be null
 * @return a character denoting the file type; DT_UNKNOWN for null or unlisted extensions
 */
public static char docTypeExt(final String ext) {
    if (ext == null) {
        return DT_UNKNOWN;
    }
    // image formats
    if (ext.equals("gif") || ext.equals("ico") || ext.equals("bmp") || ext.equals("jpg")
            || ext.equals("jpeg") || ext.equals("png") || ext.equals("tif") || ext.equals("tiff")) {
        return DT_IMAGE;
    }
    // markup and plain text
    if (ext.equals("htm") || ext.equals("html")) {
        return DT_HTML;
    }
    if (ext.equals("txt")) {
        return DT_TEXT;
    }
    // office and print documents
    if (ext.equals("doc") || ext.equals("rtf")) {
        return DT_DOC;
    }
    if (ext.equals("pdf") || ext.equals("ps")) {
        return DT_PDFPS;
    }
    // audio formats
    if (ext.equals("mp3") || ext.equals("aac") || ext.equals("m4a")
            || ext.equals("ogg") || ext.equals("wav") || ext.equals("wma")) {
        return DT_AUDIO;
    }
    // video formats
    if (ext.equals("avi") || ext.equals("mov") || ext.equals("qt") || ext.equals("mpg")
            || ext.equals("mp4") || ext.equals("m4v") || ext.equals("mkv") || ext.equals("mpeg")) {
        return DT_MOVIE;
    }
    if (ext.equals("md5")) {
        return DT_SHARE;
    }
    if (ext.equals("asf")) {
        return DT_FLASH;
    }
    return DT_UNKNOWN;
}
/**
 * doctype calculation based on file extensions; this is the url wrapper
 * around docTypeExt(String), which holds the extension table.
 * @param url the url whose file name extension is evaluated
 * @return a character denoting the file type; DT_UNKNOWN if the url has no
 *         extension or the extension is not listed in docTypeExt
 */
public static char docType(final MultiProtocolURL url) {
    final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
    if (ext == null) return DT_UNKNOWN;
    // Single source of truth: the former local chain compared against dot-prefixed
    // names (".gif", ...) and left the delegation below unreachable.
    // NOTE(review): presumably getFileExtension returns the extension without a
    // leading dot, as docTypeExt expects -- confirm against MultiProtocolURL.
    return docTypeExt(ext);
}
/**
* doctype calculation based on the mime type
* @param mime
* @return a character denoting the file type
*/
public static char docType(final String mime) {
// serverLog.logFinest("PLASMA", "docType mime=" + mime);
char doctype = DT_UNKNOWN;
@ -120,6 +149,12 @@ public class Response {
return doctype;
}
/**
* reverse mime type calculation; this is just a heuristic
* @param ext
* @param doctype
* @return a mime type string
*/
public static String[] doctype2mime(String ext, char doctype) {
if (doctype == DT_PDFPS) return new String[]{"application/pdf"};
if (doctype == DT_HTML) return new String[]{"text/html"};

@ -92,7 +92,7 @@ public abstract class AbstractParser implements Parser {
/**
 * Wraps a single string into a new, mutable list.
 * @param t the element to wrap; may be null
 * @return a new list containing t exactly once, or an empty list if t is null
 */
public static List<String> singleList(String t) {
    List<String> c = new ArrayList<String>(1);
    // add the element once, and never store a null entry
    if (t != null) c.add(t);
    return c;
}

@ -817,17 +817,22 @@ dc_rights
final List<AnchorURL> anchors = new ArrayList<AnchorURL>();
final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<DigestURL, String>();
final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
final Set<String> languages = new HashSet<String>();
double lon = 0.0d, lat = 0.0d;
Date date = new Date();
String charset = null;
int mindepth = 999;
for (final Document doc: docs) {
if (doc == null) continue;
if (doc == null) continue;
if (charset == null) charset = doc.charset; // TODO: uses this charset for merged content
final String author = doc.dc_creator();
if (author.length() > 0) {
if (authors.length() > 0) authors.append(",");
subjects.append(author);
authors.append(author);
}
final String publisher = doc.dc_publisher();
@ -861,6 +866,7 @@ dc_rights
if (doc.date.before(date)) date = doc.date;
if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
if (doc.dc_language() != null) languages.add(doc.dc_language());
}
// clean up parser data
@ -878,9 +884,9 @@ dc_rights
Document newDoc = new Document(
location,
globalMime,
charset,
null,
null,
null,
languages,
subjects.toString().split(" |,"),
titlesa,
authors.toString(),

@ -109,6 +109,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
h6(TagType.pair),
title(TagType.pair),
b(TagType.pair),
em(TagType.pair),
strong(TagType.pair),
u(TagType.pair),
i(TagType.pair),
@ -563,6 +564,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
} else if ((tag.name.equalsIgnoreCase("strong")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.bold.inc(h);
} else if ((tag.name.equalsIgnoreCase("em")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.bold.inc(h);
} else if ((tag.name.equalsIgnoreCase("i")) && (tag.content.length() < 1024)) {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.italic.inc(h);

@ -149,7 +149,7 @@ public class YaCyDefaultServlet extends HttpServlet {
protected ConcurrentHashMap<File, SoftReference<Method>> templateMethodCache = null;
// settings for multipart/form-data
protected static final File TMPDIR = new File(System.getProperty("java.io.tmpdir"));
protected static final int SIZE_FILE_THRESHOLD = 20 * 1024 * 1024;
protected static final int SIZE_FILE_THRESHOLD = 100 * 1024 * 1024; // 100 MB is a lot but appropriate for multi-document pushed using the push_p.json servlet
protected static final FileItemFactory DISK_FILE_ITEM_FACTORY = new DiskFileItemFactory(SIZE_FILE_THRESHOLD, TMPDIR);
/* ------------------------------------------------------------ */
@Override

@ -367,7 +367,9 @@ public class QueryGoal {
// combine these queries for all relevant fields
q.append(" AND (");
q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(w).append("^10.0) OR ");
q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(w).append("^100.0) OR ");
q.append('(').append(CollectionSchema.title.getSolrFieldName()).append(':').append(w).append("^50.0) OR ");
q.append('(').append(CollectionSchema.keywords.getSolrFieldName()).append(':').append(w).append("^10.0) OR ");
q.append('(').append(CollectionSchema.text_t.getSolrFieldName()).append(':').append(w).append(')');
q.append(')');

@ -65,6 +65,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
import net.yacy.cora.storage.HandleSet;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.cora.util.SpaceExceededException;
import net.yacy.crawler.retrieval.Response;
import net.yacy.data.WorkTables;
import net.yacy.document.Condenser;
import net.yacy.document.LargeNumberCache;
@ -1467,42 +1468,38 @@ public final class SearchEvent {
public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
if (item < imageViewed.size()) return nthImage(item);
if (imageSpare.size() > 0) return nextSpare();
ResultEntry ms = oneResult(item, timeout);
// check if the match was made in the url or in the image links
if (ms != null) {
SolrDocument doc = ms.getNode();
if (ms == null) throw new MalformedURLException("no image url found");
// try to get more
SolrDocument doc = ms.getNode();
// there can be two different kinds of image hits: either the document itself is an image or images are embedded in the links of text documents.
String mime = (String) doc.getFirstValue(CollectionSchema.content_type.getSolrFieldName());
if (Response.docType(ms.url()) == Response.DT_IMAGE || Response.docType(mime) == Response.DT_IMAGE) {
String id = ASCII.String(ms.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
} else {
Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
if (img != null) {
int c = 0;
for (Object i: img) {
String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) {
if (img != null && img.size() > 0) {
for (int c = 0; c < img.size(); c++) {
String image_urlstub = (String) SetTools.nth(img, c);
String image_alt = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
if (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)) {
try {
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i);
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + image_urlstub);
Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c);
Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c);
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", a, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", image_alt, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
} catch (MalformedURLException e) {
continue;
}
}
c++;
}
}
if (MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(ms.url().getFileName()))) {
String id = ASCII.String(ms.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
}
if (img != null && img.size() > 0) {
DigestURL imageUrl = new DigestURL((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0));
String imagetext = alt != null && alt.size() > 0 ? (String) SetTools.nth(alt, 0) : "";
String id = ASCII.String(imageUrl.hash());
if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0));
}
}
if (imageSpare.size() > 0) return nextSpare();
throw new MalformedURLException("no image url found");

@ -421,9 +421,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
}
add(doc, CollectionSchema.collection_sxt, cs);
}
char doctype = Response.docType(responseHeader.getContentType());
List<String> titles = document.titles();
if (allAttr || contains(CollectionSchema.title)) {
if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
String mediatitle = responseHeader.get(HeaderFramework.X_YACY_MEDIA_TITLE, "");
if (mediatitle.length() > 0) {
if (titles.size() == 0) titles.add(mediatitle); else titles.set(0, mediatitle);
}
}
add(doc, CollectionSchema.title, titles);
if ((allAttr || contains(CollectionSchema.title_exact_signature_l)) && titles.size() > 0) {
add(doc, CollectionSchema.title_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(titles.get(0)));
@ -473,7 +479,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
if (document.getDate().before(lastModified)) lastModified = document.getDate();
add(doc, CollectionSchema.last_modified, lastModified);
}
if (allAttr || contains(CollectionSchema.keywords)) add(doc, CollectionSchema.keywords, document.dc_subject(' '));
if (allAttr || contains(CollectionSchema.keywords)) {
String keywords = document.dc_subject(' ');
if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
keywords = responseHeader.get(HeaderFramework.X_YACY_MEDIA_KEYWORDS, keywords);
}
add(doc, CollectionSchema.keywords, keywords);
}
if (allAttr || contains(CollectionSchema.synonyms_sxt)) {
List<String> synonyms = condenser.synonyms();
add(doc, CollectionSchema.synonyms_sxt, synonyms);

@ -49,7 +49,7 @@ public class ParserTest {
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description()[0], containsString(testFile[4]));
if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
}
} catch (final InterruptedException ex) {}
}
@ -81,9 +81,9 @@ public class ParserTest {
System.out.println("Parsed " + filename + ": " + str);
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
// assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description()[0], containsString(testFile[4]));
if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
}
} catch (final InterruptedException ex) {}
}
@ -115,7 +115,7 @@ public class ParserTest {
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description()[0], containsString(testFile[4]));
if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
}
} catch (final InterruptedException ex) {}
}
@ -147,7 +147,7 @@ public class ParserTest {
assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
assertThat(doc.dc_title(), containsString(testFile[2]));
assertThat(doc.dc_creator(), containsString(testFile[3]));
assertThat(doc.dc_description()[0], containsString(testFile[4]));
if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
}
} catch (final InterruptedException ex) {}
}

Loading…
Cancel
Save