Merge branch 'master' of git@gitorious.org:yacy/rc1.git

11 years ago · fec673c9d1
parent 4a66af716d 2d67f29244
commit fec673c9d1
15 changed files with 147 additions and 68 deletions
--- a/htroot/api/push_p.html
+++ b/htroot/api/push_p.html
@ -26,19 +26,28 @@
 						<dd>#[count]#</dd>
 						
 						<dt>Data</dt>
-						<dd><input name="data-#[count]#" type="file"></dd>
+						<dd>data-#[count]#=<input name="data-#[count]#" type="file"></dd>
 						
 						<dt>URL</dt>
-						<dd><input name="url-#[count]#" type="text" value="http://nowhere.cc/example.txt" size="80" maxlength="512"></dd>
+						<dd>url-#[count]#=<input name="url-#[count]#" type="text" value="http://nowhere.cc/example.txt" size="80" maxlength="512"></dd>
+						
+						<dt>Collection</dt>
+						<dd>collection-#[count]#=<input name="collection-#[count]#" type="text" value="push" size="80" maxlength="512"></dd>
 						
 						<dt>Last-Modified</dt><!-- see: http://tools.ietf.org/html/rfc2616#section-14.29 -->
-						<dd><input name="lastModified-#[count]#" type="text" value="Tue, 15 Nov 1994 12:45:26 GMT" size="30" maxlength="40"></dd>
+						<!--<dd><input name="lastModified-#[count]#" type="text" value="Tue, 15 Nov 1994 12:45:26 GMT" size="30" maxlength="40"></dd>-->
+						<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="Last-Modified:Tue, 15 Nov 1994 12:45:26 GMT" size="80" maxlength="80"></dd>
 						
 						<dt>Content-Type</dt><!-- see: http://www.iana.org/assignments/media-types/media-types.xhtml -->
-						<dd><input name="contentType-#[count]#" type="text" value="text/plain" size="30" maxlength="80"></dd>
+						<!--<dd><input name="contentType-#[count]#" type="text" value="text/plain" size="30" maxlength="80"></dd>-->
+						<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="Content-Type:text/plain" size="80" maxlength="80"></dd>
+
+						<dt></dt><dd>The following attributes are only used for media type content</dd>
+						<dt>Media-Title</dt>
+						<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="X-YaCy-Media-Title:Hello Pictureworld" size="80" maxlength="200"></dd>
 						
-						<dt>Collection</dt>
-						<dd><input name="collection-#[count]#" type="text" value="push" size="30" maxlength="512"></dd>
+						<dt>Media-Keywords ()</dt>
+						<dd>responseHeader-#[count]#=<input name="responseHeader-#[count]#" type="text" value="X-YaCy-Media-Keywords:uno dos tres cuatro cinco" size="80" maxlength="200"></dd>
 					</dl>
 				</dd>
 				#{/input}#
--- a/htroot/api/push_p.java
+++ b/htroot/api/push_p.java
@ -83,6 +83,15 @@ public class push_p {
                responseHeader.put(HeaderFramework.LAST_MODIFIED, lastModified);
                responseHeader.put(HeaderFramework.CONTENT_TYPE, contentType);
                responseHeader.put(HeaderFramework.CONTENT_LENGTH, Long.toString(data.length));
+                // add generic fields
+                String[] responseHeaderMap = post.getParams("responseHeader-" + i); // strings with key-value pairs; separated by ':'
+                for (String kv: responseHeaderMap) {
+                    int p = kv.indexOf(':');
+                    if (p < 0) continue;
+                    String key = kv.substring(0, p).trim();
+                    String value = kv.substring(p + 1).trim();
+                    responseHeader.put(key, value);
+                }
                CrawlProfile profile = sb.crawler.getPushCrawlProfile(collection);
                
                // create requests and artificial response
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@ -58,6 +58,7 @@ import net.yacy.cora.protocol.ftp.FTPClient;
 import net.yacy.cora.protocol.http.HTTPClient;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.cora.util.ConcurrentLog;
+import net.yacy.crawler.retrieval.Response;
 import net.yacy.document.parser.html.CharacterCoding;

 /**
@ -1040,7 +1041,7 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
    }

    public static final boolean isImage(final String extension) {
-        return extension != null && extension.length() > 0 && "png.gif.jpg.jpeg.tif.tiff.ico".indexOf(extension.toLowerCase()) >= 0;
+        return extension != null && extension.length() > 0 && Response.docTypeExt(extension.toLowerCase()) == Response.DT_IMAGE;
    }

    public final boolean isIndividual() {
--- a/source/net/yacy/cora/protocol/HeaderFramework.java
+++ b/source/net/yacy/cora/protocol/HeaderFramework.java
@ -107,10 +107,12 @@ public class HeaderFramework extends TreeMap<String, String> implements Map<Stri
    public static final String X_ROBOTS_TAG = "X-Robots-Tag"; // see http://googleblog.blogspot.com/2007/07/robots-exclusion-protocol-now-with-even.html
    public static final String X_ROBOTS = "X-Robots";

-    public static final String X_YACY_INDEX_CONTROL = "X-YACY-Index-Control";
+    public static final String X_YACY_INDEX_CONTROL = "X-YaCy-Index-Control";
    //public static final String X_YACY_PREVIOUS_REQUEST_LINE = "X-Previous-Request-Line";
    public static final String X_YACY_KEEP_ALIVE_REQUEST_COUNT = "X-Keep-Alive-Request-Count";
    public static final String X_YACY_ORIGINAL_REQUEST_LINE = "X-Original-Request-Line";
+    public static final String X_YACY_MEDIA_TITLE = "X-YaCy-Media-Title"; // can be attached to media files which do not have metadata; this will be used as title
+    public static final String X_YACY_MEDIA_KEYWORDS = "X-YaCy-Media-Keywords"; // can be attached to media files which do not have metadata; this will be used as keywords (space-separated list of words)

    public static final String SET_COOKIE = "Set-Cookie";
    public static final String SET_COOKIE2 = "Set-Cookie2";
--- a/source/net/yacy/cora/protocol/TimeoutRequest.java
+++ b/source/net/yacy/cora/protocol/TimeoutRequest.java
@ -279,7 +279,7 @@ public class TimeoutRequest<E> {
                } }
            }).call(timeout).longValue();
        } catch (final ExecutionException e) {
-            throw new IOException(e.getMessage());
+            throw new IOException(file.toString() + ":" + e.getMessage());
        }
    }

--- a/source/net/yacy/crawler/HostBalancer.java
+++ b/source/net/yacy/crawler/HostBalancer.java
@ -348,6 +348,8 @@ public class HostBalancer implements Balancer {
            return request;
        } catch (ConcurrentModificationException e) {
            continue tryagain;
+        } catch (IOException e) {
+            throw e;
        } catch (Throwable e) {
            throw new IOException(e.getMessage());
        }
--- a/source/net/yacy/crawler/retrieval/Response.java
+++ b/source/net/yacy/crawler/retrieval/Response.java
@ -69,35 +69,64 @@ public class Response {
    private        int                status;          // tracker indexing status, see status defs below
    private final  boolean            fromCache;

-    // doctype calculation
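+    /**
+     * doctype calculation by file extension
+     * TODO: this must be enhanced with a more generic way of configuration
+     * @param ext the file extension without a leading dot; compared case-sensitively against lower-case names; may be null
+     * @return a character denoting the file type; DT_UNKNOWN if ext is null or not recognized
+     */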
+    public static char docTypeExt(final String ext) {
+        if (ext == null) return DT_UNKNOWN;
+        if (ext.equals("gif"))  return DT_IMAGE;
+        if (ext.equals("ico"))  return DT_IMAGE;
+        if (ext.equals("bmp"))  return DT_IMAGE;
+        if (ext.equals("jpg"))  return DT_IMAGE;
+        if (ext.equals("jpeg")) return DT_IMAGE;
+        if (ext.equals("png"))  return DT_IMAGE;
+        if (ext.equals("tif"))  return DT_IMAGE;
+        if (ext.equals("tiff")) return DT_IMAGE;
+        if (ext.equals("htm"))  return DT_HTML;
+        if (ext.equals("html")) return DT_HTML;
+        if (ext.equals("txt"))  return DT_TEXT;
+        if (ext.equals("doc"))  return DT_DOC;
+        if (ext.equals("rtf"))  return DT_DOC;
+        if (ext.equals("pdf"))  return DT_PDFPS;
+        if (ext.equals("ps"))   return DT_PDFPS;
+        if (ext.equals("mp3"))  return DT_AUDIO;
+        if (ext.equals("aac"))  return DT_AUDIO;
+        if (ext.equals("m4a"))  return DT_AUDIO;
+        if (ext.equals("ogg"))  return DT_AUDIO;
+        if (ext.equals("wav"))  return DT_AUDIO;
+        if (ext.equals("wma"))  return DT_AUDIO;
+        if (ext.equals("avi"))  return DT_MOVIE;
+        if (ext.equals("mov"))  return DT_MOVIE;
+        if (ext.equals("qt"))   return DT_MOVIE;
+        if (ext.equals("mpg"))  return DT_MOVIE;
+        if (ext.equals("mp4"))  return DT_MOVIE;
+        if (ext.equals("m4v"))  return DT_MOVIE;
+        if (ext.equals("mkv"))  return DT_MOVIE;
+        if (ext.equals("md5"))  return DT_SHARE;
+        if (ext.equals("mpeg")) return DT_MOVIE;
+        if (ext.equals("asf"))  return DT_FLASH;
+        return DT_UNKNOWN;
+    }
+    
+    /**
+     * doctype calculation based on file extensions; this is the url wrapper
+     * @param url the url whose file name extension is evaluated
+     * @return a character denoting the file type; DT_UNKNOWN if the file name has no extension
+     */
    public static char docType(final MultiProtocolURL url) {
        String ext = MultiProtocolURL.getFileExtension(url.getFileName());
        if (ext == null) return DT_UNKNOWN;
-        if (ext.equals(".gif"))  return DT_IMAGE;
-        if (ext.equals(".ico"))  return DT_IMAGE;
-        if (ext.equals(".bmp"))  return DT_IMAGE;
-        if (ext.equals(".jpg"))  return DT_IMAGE;
-        if (ext.equals(".jpeg")) return DT_IMAGE;
-        if (ext.equals(".png"))  return DT_IMAGE;
-        if (ext.equals(".tif"))  return DT_IMAGE;
-        if (ext.equals(".tiff")) return DT_IMAGE;
-        if (ext.equals(".htm"))  return DT_HTML;
-        if (ext.equals(".html")) return DT_HTML;
-        if (ext.equals(".txt"))  return DT_TEXT;
-        if (ext.equals(".doc"))  return DT_DOC;
-        if (ext.equals(".rtf"))  return DT_DOC;
-        if (ext.equals(".pdf"))  return DT_PDFPS;
-        if (ext.equals(".ps"))   return DT_PDFPS;
-        if (ext.equals(".avi"))  return DT_MOVIE;
-        if (ext.equals(".mov"))  return DT_MOVIE;
-        if (ext.equals(".qt"))   return DT_MOVIE;
-        if (ext.equals(".mpg"))  return DT_MOVIE;
-        if (ext.equals(".md5"))  return DT_SHARE;
-        if (ext.equals(".mpeg")) return DT_MOVIE;
-        if (ext.equals(".asf"))  return DT_FLASH;
-        return DT_UNKNOWN;
+        return docTypeExt(ext);
    }

+    /**
+     * doctype calculation based on the mime type
+     * @param mime
+     * @return a character denoting the file type
+     */
    public static char docType(final String mime) {
        // serverLog.logFinest("PLASMA", "docType mime=" + mime);
        char doctype = DT_UNKNOWN;
@ -120,6 +149,12 @@ public class Response {
        return doctype;
    }

+    /**
+     * reverse mime type calculation; this is just a heuristic
+     * @param ext
+     * @param doctype the doctype character (one of the DT_* constants)
+     * @return an array of mime type strings
+     */
    public static String[] doctype2mime(String ext, char doctype) {
        if (doctype == DT_PDFPS) return new String[]{"application/pdf"};
        if (doctype == DT_HTML) return new String[]{"text/html"};
--- a/source/net/yacy/document/AbstractParser.java
+++ b/source/net/yacy/document/AbstractParser.java
@ -92,7 +92,7 @@ public abstract class AbstractParser implements Parser {

    public static List<String> singleList(String t) {
        List<String> c = new ArrayList<String>(1);
-        c.add(t);
+        if (t != null) c.add(t);
        return c;
    }

--- a/source/net/yacy/document/Document.java
+++ b/source/net/yacy/document/Document.java
@ -817,17 +817,22 @@ dc_rights
        final List<AnchorURL>       anchors       = new ArrayList<AnchorURL>();
        final LinkedHashMap<DigestURL, String> rss = new LinkedHashMap<DigestURL, String>();
        final LinkedHashMap<AnchorURL, ImageEntry> images = new LinkedHashMap<AnchorURL, ImageEntry>();
+        final Set<String> languages = new HashSet<String>();
        double lon = 0.0d, lat = 0.0d;
        Date date = new Date();
+        String charset = null;

        int mindepth = 999;
        for (final Document doc: docs) {

-        	if (doc == null) continue;
+            if (doc == null) continue;
+
+            if (charset == null) charset = doc.charset; // TODO: uses this charset for merged content
+
            final String author = doc.dc_creator();
            if (author.length() > 0) {
                if (authors.length() > 0) authors.append(",");
-                subjects.append(author);
+                authors.append(author);
            }

            final String publisher = doc.dc_publisher();
@ -861,6 +866,7 @@ dc_rights
            if (doc.date.before(date)) date = doc.date;
            
            if (doc.getDepth() < mindepth) mindepth = doc.getDepth();
+            if (doc.dc_language() != null) languages.add(doc.dc_language());
        }

        // clean up parser data
@ -878,9 +884,9 @@ dc_rights
        Document newDoc = new Document(
                location,
                globalMime,
+                charset,
                null,
-                null,
-                null,
+                languages,
                subjects.toString().split(" |,"),
                titlesa,
                authors.toString(),
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -109,6 +109,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        h6(TagType.pair),
        title(TagType.pair),
        b(TagType.pair),
+        em(TagType.pair),
        strong(TagType.pair),
        u(TagType.pair),
        i(TagType.pair),
@ -563,6 +564,9 @@ public class ContentScraper extends AbstractScraper implements Scraper {
        } else if ((tag.name.equalsIgnoreCase("strong")) && (tag.content.length() < 1024)) {
            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
            if (h.length() > 0) this.bold.inc(h);
+        } else if ((tag.name.equalsIgnoreCase("em")) && (tag.content.length() < 1024)) {
+            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
+            if (h.length() > 0) this.bold.inc(h);
        } else if ((tag.name.equalsIgnoreCase("i")) && (tag.content.length() < 1024)) {
            h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
            if (h.length() > 0) this.italic.inc(h);
--- a/source/net/yacy/http/servlets/YaCyDefaultServlet.java
+++ b/source/net/yacy/http/servlets/YaCyDefaultServlet.java
@ -149,7 +149,7 @@ public class YaCyDefaultServlet extends HttpServlet  {
    protected ConcurrentHashMap<File, SoftReference<Method>> templateMethodCache = null;
    // settings for multipart/form-data
    protected static final File TMPDIR = new File(System.getProperty("java.io.tmpdir"));
-    protected static final int SIZE_FILE_THRESHOLD = 20 * 1024 * 1024;
+    protected static final int SIZE_FILE_THRESHOLD = 100 * 1024 * 1024; // 100 MB is a lot but appropriate for multi-document pushes using the push_p.json servlet
    protected static final FileItemFactory DISK_FILE_ITEM_FACTORY = new DiskFileItemFactory(SIZE_FILE_THRESHOLD, TMPDIR);
    /* ------------------------------------------------------------ */
    @Override
--- a/source/net/yacy/search/query/QueryGoal.java
+++ b/source/net/yacy/search/query/QueryGoal.java
@ -367,7 +367,9 @@ public class QueryGoal {
        
        // combine these queries for all relevant fields
        q.append(" AND (");
-        q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(w).append("^10.0) OR ");
+        q.append('(').append(CollectionSchema.images_text_t.getSolrFieldName()).append(':').append(w).append("^100.0) OR ");
+        q.append('(').append(CollectionSchema.title.getSolrFieldName()).append(':').append(w).append("^50.0) OR ");
+        q.append('(').append(CollectionSchema.keywords.getSolrFieldName()).append(':').append(w).append("^10.0) OR ");
        q.append('(').append(CollectionSchema.text_t.getSolrFieldName()).append(':').append(w).append(')');
        q.append(')');

--- a/source/net/yacy/search/query/SearchEvent.java
+++ b/source/net/yacy/search/query/SearchEvent.java
@ -65,6 +65,7 @@ import net.yacy.cora.sorting.WeakPriorityBlockingQueue.ReverseElement;
 import net.yacy.cora.storage.HandleSet;
 import net.yacy.cora.util.ConcurrentLog;
 import net.yacy.cora.util.SpaceExceededException;
+import net.yacy.crawler.retrieval.Response;
 import net.yacy.data.WorkTables;
 import net.yacy.document.Condenser;
 import net.yacy.document.LargeNumberCache;
@ -1467,42 +1468,38 @@ public final class SearchEvent {
    
    public ImageResult oneImageResult(final int item, final long timeout) throws MalformedURLException {
        if (item < imageViewed.size()) return nthImage(item);
-        
+        if (imageSpare.size() > 0) return nextSpare();
        ResultEntry ms = oneResult(item, timeout);
        // check if the match was made in the url or in the image links
-        if (ms != null) {
-            SolrDocument doc = ms.getNode();
+        if (ms == null) throw new MalformedURLException("no image url found");
+        // try to get more
+        SolrDocument doc = ms.getNode();
+        // there can be two different kinds of image hits: either the document itself is an image or images are embedded in the links of text documents.
+        String mime = (String) doc.getFirstValue(CollectionSchema.content_type.getSolrFieldName());
+        if (Response.docType(ms.url()) == Response.DT_IMAGE || Response.docType(mime) == Response.DT_IMAGE) {
+            String id = ASCII.String(ms.hash());
+            if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
+        } else {
            Collection<Object> alt = doc.getFieldValues(CollectionSchema.images_alt_sxt.getSolrFieldName());
            Collection<Object> img = doc.getFieldValues(CollectionSchema.images_urlstub_sxt.getSolrFieldName());
            Collection<Object> prt = doc.getFieldValues(CollectionSchema.images_protocol_sxt.getSolrFieldName());
-            if (img != null) {
-                int c = 0;
-                for (Object i: img) {
-                    String a = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
-                    if (query.getQueryGoal().matches((String) i) || query.getQueryGoal().matches(a)) {
+            if (img != null && img.size() > 0) {
+                for (int c = 0; c < img.size(); c++) {
+                    String image_urlstub =  (String) SetTools.nth(img, c);
+                    String image_alt = alt != null && alt.size() > c ? (String) SetTools.nth(alt, c) : "";
+                    if (query.getQueryGoal().matches(image_urlstub) || query.getQueryGoal().matches(image_alt)) {
                        try {
-                            DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + i);
+                            DigestURL imageUrl = new DigestURL((prt != null && prt.size() > c ? SetTools.nth(prt, c) : "http") + "://" + image_urlstub);
                            Object heightO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_height_val.getSolrFieldName()), c);
                            Object widthO = SetTools.nth(doc.getFieldValues(CollectionSchema.images_width_val.getSolrFieldName()), c);
                            String id = ASCII.String(imageUrl.hash());
-                            if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", a, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
+                            if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", image_alt, widthO == null ? 0 : (Integer) widthO, heightO == null ? 0 : (Integer) heightO, 0));
                        } catch (MalformedURLException e) {
                            continue;
                        }
                    }
-                    c++;
                }
            }
-            if (MultiProtocolURL.isImage(MultiProtocolURL.getFileExtension(ms.url().getFileName()))) {
-                String id = ASCII.String(ms.hash());
-                if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), ms.url(), "", ms.title(), 0, 0, 0));
-            }
-            if (img != null && img.size() > 0) {
-                DigestURL imageUrl = new DigestURL((prt != null && prt.size() > 0 ? SetTools.nth(prt, 0) : "http") + "://" + SetTools.nth(img, 0));
-                String imagetext =  alt != null && alt.size() > 0 ? (String) SetTools.nth(alt, 0) : "";
-                String id = ASCII.String(imageUrl.hash());
-                if (!imageViewed.containsKey(id) && !imageSpare.containsKey(id)) imageSpare.put(id, new ImageResult(ms.url(), imageUrl, "", imagetext, 0, 0, 0));
-            }
        }
        if (imageSpare.size() > 0) return nextSpare();
        throw new MalformedURLException("no image url found");
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -421,9 +421,15 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
            }
            add(doc, CollectionSchema.collection_sxt, cs);
        }
-        
+        char doctype = Response.docType(responseHeader.getContentType());
        List<String> titles = document.titles();
        if (allAttr || contains(CollectionSchema.title)) {
+            if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
+                String mediatitle = responseHeader.get(HeaderFramework.X_YACY_MEDIA_TITLE, "");
+                if (mediatitle.length() > 0) {
+                    if (titles.size() == 0) titles.add(mediatitle); else titles.set(0, mediatitle);
+                }
+            }
            add(doc, CollectionSchema.title, titles);
            if ((allAttr || contains(CollectionSchema.title_exact_signature_l)) && titles.size() > 0) {
                add(doc, CollectionSchema.title_exact_signature_l, EnhancedTextProfileSignature.getSignatureLong(titles.get(0)));
@ -473,7 +479,13 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
            if (document.getDate().before(lastModified)) lastModified = document.getDate();
            add(doc, CollectionSchema.last_modified, lastModified);
        }
-        if (allAttr || contains(CollectionSchema.keywords)) add(doc, CollectionSchema.keywords, document.dc_subject(' '));
+        if (allAttr || contains(CollectionSchema.keywords)) {
+            String keywords = document.dc_subject(' ');
+            if (doctype == Response.DT_IMAGE || doctype == Response.DT_AUDIO || doctype == Response.DT_MOVIE) {
+                keywords = responseHeader.get(HeaderFramework.X_YACY_MEDIA_KEYWORDS, keywords);
+            }
+            add(doc, CollectionSchema.keywords, keywords);
+        }
        if (allAttr || contains(CollectionSchema.synonyms_sxt)) {
            List<String> synonyms = condenser.synonyms();
            add(doc, CollectionSchema.synonyms_sxt, synonyms);
--- a/test/net/yacy/document/ParserTest.java
+++ b/test/net/yacy/document/ParserTest.java
@ -49,7 +49,7 @@ public class ParserTest {
                            assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                            assertThat(doc.dc_title(), containsString(testFile[2]));
                            assertThat(doc.dc_creator(), containsString(testFile[3]));
-                            assertThat(doc.dc_description()[0], containsString(testFile[4]));
+                            if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
                        }
                    } catch (final InterruptedException ex) {}
                    }
@ -81,9 +81,9 @@ public class ParserTest {

                            System.out.println("Parsed " + filename + ": " + str);
                            assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
-                     //       assertThat(doc.dc_title(), containsString(testFile[2]));
+                            assertThat(doc.dc_title(), containsString(testFile[2]));
                            assertThat(doc.dc_creator(), containsString(testFile[3]));
-                            assertThat(doc.dc_description()[0], containsString(testFile[4]));
+                            if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
                        }
                    } catch (final InterruptedException ex) {}
                    }
@ -115,7 +115,7 @@ public class ParserTest {
                            assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                            assertThat(doc.dc_title(), containsString(testFile[2]));
                            assertThat(doc.dc_creator(), containsString(testFile[3]));
-                            assertThat(doc.dc_description()[0], containsString(testFile[4]));
+                            if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
                        }
                    } catch (final InterruptedException ex) {}
                    }
@ -147,7 +147,7 @@ public class ParserTest {
                            assertThat(str.toString(), containsString("In München steht ein Hofbräuhaus, dort gibt es Bier in Maßkrügen"));
                            assertThat(doc.dc_title(), containsString(testFile[2]));
                            assertThat(doc.dc_creator(), containsString(testFile[3]));
-                            assertThat(doc.dc_description()[0], containsString(testFile[4]));
+                            if (testFile[4].length() > 0) assertThat(doc.dc_description()[0], containsString(testFile[4]));
                        }
                    } catch (final InterruptedException ex) {}
                    }