Merge branch 'master' of github.com:yacy/yacy_search_server

10 years ago · a44cc774d0
parent 3cbf86f295 7a64bebb86
commit a44cc774d0
8 changed files with 71 additions and 50 deletions
--- a/htroot/Table_API_p.html
+++ b/htroot/Table_API_p.html
@ -85,7 +85,7 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
          <td valign="top" align="left"><input type="checkbox" name="item_#[count]#" value="mark_#[pk]#" /></td>
          <td valign="top">#[type]#
          #(isCrawlerStart)#::<br/><br/>
-          <a href="#[url]#" title="clone"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>::<br/><br/>
+          <a href="#[url]#" title="clone" target="_parent"><img src="env/grafics/doc.gif"><img src="env/grafics/right.gif"><img src="env/grafics/doc.gif"></a>::<br/><br/>
         
         <script>
           var f = document.createElement("form");
@ -93,7 +93,9 @@ To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de
           f.setAttribute("enctype", "multipart/form-data");
           f.setAttribute("accept-charset", "UTF-8");
           f.setAttribute("action", "#[servlet]#");
+           f.setAttribute("target", "_parent");
           f.setAttribute("id", "#[pk]#");
+           f.setAttribute("name", "#[pk]#");
           #{attr}#
           var e = document.createElement("input");
           e.setAttribute("type", "hidden");
--- a/htroot/yacysearchitem.java
+++ b/htroot/yacysearchitem.java
@ -316,7 +316,7 @@ public class yacysearchitem {

                final String license = URLLicense.aquireLicense(image.imageUrl); // this is just the license key to get the image forwarded through the YaCy thumbnail viewer, not an actual lawful license
                //sb.loader.loadIfNotExistBackground(image.imageUrl, 1024 * 1024 * 10, null, ClientIdentification.yacyIntranetCrawlerAgent);
-                prop.putHTML("content_item_hrefCache", "ViewImage." + ("gif.png.svg".contains(imageUrlExt) ? imageUrlExt : "png") + "?maxwidth=128&maxheight=128&code="+license+"&isStatic=true&quadratic=&url=" + imageUrlstring);
+                prop.putHTML("content_item_hrefCache", "ViewImage." + (!imageUrlExt.isEmpty() && "gif.png.svg".contains(imageUrlExt) ? imageUrlExt : "png") + "?maxwidth=128&maxheight=128&code="+license+"&isStatic=true&quadratic=&url=" + imageUrlstring);
                prop.putHTML("content_item_href", imageUrlstring);
                prop.putHTML("content_item_target", target);
                prop.put("content_item_code", license);
--- a/source/net/yacy/cora/document/id/MultiProtocolURL.java
+++ b/source/net/yacy/cora/document/id/MultiProtocolURL.java
@ -945,6 +945,14 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        return this.searchpart;
    }

+    /**
+     * Returns the search part parameters as a key=value map.
+     * Keys and values stay in the internal url-encoded format;
+     * for unescaped return values
+     * @see #getAttributes()
+     *
+     * @return map of parameter name to value, or null if there is no search part
+     */
    public Map<String, String> getSearchpartMap() {
        if (this.searchpart == null) return null;
        this.searchpart = this.searchpart.replaceAll("&amp;", "&");
@ -1027,6 +1035,10 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU

    /**
     * Evaluates url search part and returns attribute '=' value pairs
+     * The returned values are in clear text (without url encoding).
+     *
+     * To get the parameter map with url-encoded keys and values
+     * @see #getSearchpartMap()
     *
     * @return map key=attribute name, value=string after '='
     */
@ -1037,9 +1049,9 @@ public class MultiProtocolURL implements Serializable, Comparable<MultiProtocolU
        for (final String element : questp) {
            int p = element.indexOf('=');
            if (p != -1) {
-                map.put(element.substring(0, p), element.substring(p + 1));
+                map.put(unescape(element.substring(0, p)), unescape(element.substring(p + 1)));
            } else {
-                if (!element.isEmpty()) map.put(element, "");
+                if (!element.isEmpty()) map.put(unescape(element), "");
            }
        }
        return map;
--- a/source/net/yacy/crawler/RecrawlBusyThread.java
+++ b/source/net/yacy/crawler/RecrawlBusyThread.java
@ -35,6 +35,7 @@ import net.yacy.crawler.data.NoticedURL;
 import net.yacy.crawler.retrieval.Request;
 import net.yacy.kelondro.workflow.AbstractBusyThread;
 import net.yacy.search.Switchboard;
+import net.yacy.search.SwitchboardConstants;
 import net.yacy.search.schema.CollectionSchema;
 import org.apache.solr.common.SolrDocument;
 import org.apache.solr.common.SolrDocumentList;
@ -53,10 +54,11 @@ public class RecrawlBusyThread extends AbstractBusyThread {
    private String currentQuery = CollectionSchema.fresh_date_dt.getSolrFieldName()+":[* TO NOW/DAY-1DAY]"; // current query
    private boolean includefailed = false; // flag if docs with httpstatus_i <> 200 shall be recrawled
    private int chunkstart = 0;
-    private int chunksize = 200;
+    private final int chunksize;
    final Switchboard sb;
    private final Set<DigestURL> urlstack; // buffer of urls to recrawl
    public long urlsfound = 0;
+    private String solrSortBy;

    public RecrawlBusyThread(Switchboard xsb) {
        super(3000, 1000); // set lower limits of cycle delay
@ -66,6 +68,10 @@ public class RecrawlBusyThread extends AbstractBusyThread {

        this.sb = xsb;
        urlstack = new HashSet<DigestURL>();
+        // workaround to prevent solr exception on existing index (not fully reindexed) since intro of schema with docvalues
+        // org.apache.solr.core.SolrCore java.lang.IllegalStateException: unexpected docvalues type NONE for field 'load_date_dt' (expected=NUMERIC). Use UninvertingReader or index with docvalues.
+        solrSortBy = null; // CollectionSchema.load_date_dt.getSolrFieldName() + " asc";
+        this.chunksize = sb.getConfigInt(SwitchboardConstants.CRAWLER_THREADS_ACTIVE_MAX, 200);
    }

    /**
@ -142,8 +148,8 @@ public class RecrawlBusyThread extends AbstractBusyThread {
     */
    @Override
    public boolean job() {
-        // other crawls are running, do nothing
-        if (sb.crawlQueues.coreCrawlJobSize() > 0) {
+        // more than chunksize crawls are running, do nothing
+        if (sb.crawlQueues.coreCrawlJobSize() > this.chunksize) {
            return false;
        }

@ -168,7 +174,7 @@ public class RecrawlBusyThread extends AbstractBusyThread {
            try {
                // query all or only httpstatus=200 depending on includefailed flag
                docList = solrConnector.getDocumentListByQuery(this.includefailed ? currentQuery : currentQuery + " AND (" + CollectionSchema.httpstatus_i.name() + ":200)",
-                        CollectionSchema.load_date_dt.getSolrFieldName() + " asc", this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
+                        this.solrSortBy, this.chunkstart, this.chunksize, CollectionSchema.sku.getSolrFieldName());
                this.urlsfound = docList.getNumFound();
            } catch (Throwable e) {
                this.urlsfound = 0;
--- a/source/net/yacy/document/parser/html/ContentScraper.java
+++ b/source/net/yacy/document/parser/html/ContentScraper.java
@ -57,7 +57,6 @@ import net.yacy.document.SentenceReader;
 import net.yacy.document.VocabularyScraper;
 import net.yacy.document.parser.htmlParser;
 import net.yacy.document.parser.html.Evaluation.Element;
-import net.yacy.document.parser.images.genericImageParser;
 import net.yacy.kelondro.io.CharBuffer;
 import net.yacy.kelondro.util.FileUtils;
 import net.yacy.kelondro.util.ISO639;
@ -552,12 +551,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
            href = CharacterCoding.html2unicode(href);
            AnchorURL url;
            if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
-                final String ext = MultiProtocolURL.getFileExtension(url.getFileName());
-                if (genericImageParser.SUPPORTED_EXTENSIONS.contains(ext)) {
-                    // special handling of such urls: put them to the image urls
-                    final ImageEntry ie = new ImageEntry(url, recursiveParse(url, tag.content.getChars()), -1, -1, -1);
-                    this.images.add(ie);
-                } else {
                if (followDenied()) {
                    String rel = tag.opts.getProperty("rel", EMPTY_STRING);
                    if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow"; 
@ -569,7 +562,6 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                recursiveParse(url, tag.content.getChars());
                this.anchors.add(url);
            }
-            }
            this.evaluationScores.match(Element.apath, href);
        }
        final String h;
--- a/source/net/yacy/document/parser/html/TransformerWriter.java
+++ b/source/net/yacy/document/parser/html/TransformerWriter.java
@ -291,8 +291,7 @@ public final class TransformerWriter extends Writer {
        }
        if (this.transformer != null && this.transformer.isTag0(tagname)) {
            // this single tag is collected at once here
-            char[] b = new char[0];
-            b = this.transformer.transformTag0(tag, quotechar);
+            char[] b = this.transformer.transformTag0(tag, quotechar);
            return b;
        } else if ((this.scraper != null && this.scraper.isTag1(tagname)) ||
                   (this.transformer != null && this.transformer.isTag1(tagname))) {
--- a/source/net/yacy/document/parser/images/genericImageParser.java
+++ b/source/net/yacy/document/parser/images/genericImageParser.java
@ -65,18 +65,17 @@ import com.drew.metadata.Metadata;
 import com.drew.metadata.Tag;
 import com.drew.metadata.exif.GpsDirectory;

-public class genericImageParser extends AbstractParser implements Parser {
-
 /**
-     * a list of mime types that are supported by this parser class
-     * @see #getSupportedMimeTypes()
+ * Parser for images, bmp and jpeg and all supported by the Java Image I/O API
+ * by default java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered)
+ * http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html
 */
-    public static final Set<String> SUPPORTED_MIME_TYPES = new HashSet<String>();
-    public static final Set<String> SUPPORTED_EXTENSIONS = new HashSet<String>();
-    static {
+public class genericImageParser extends AbstractParser implements Parser {
+
+    public genericImageParser() {
+        super("Generic Image Parser");
+
        SUPPORTED_EXTENSIONS.add("bmp");
-        // by default java ImageIO supports bmp, gif, jpg, jpeg, png, wbmp (tif if jai-imageio is in classpath/registered)
-        // http://download.java.net/media/jai-imageio/javadoc/1.1/overview-summary.html
        SUPPORTED_EXTENSIONS.add("jpe"); // not listed in ImageIO extension but sometimes uses for jpeg
        SUPPORTED_EXTENSIONS.addAll(Arrays.asList(ImageIO.getReaderFileSuffixes()));

@ -85,10 +84,6 @@ public class genericImageParser extends AbstractParser implements Parser {
        SUPPORTED_MIME_TYPES.addAll(Arrays.asList(ImageIO.getReaderMIMETypes()));
    }

-    public genericImageParser() {
-        super("Generic Image Parser");
-    }
-
    @Override
    public Document[] parse(
            final AnchorURL location,
@ -129,6 +124,13 @@ public class genericImageParser extends AbstractParser implements Parser {
            byte[] b;
            try {
                b = FileUtils.read(source);
+                // check jpeg file signature (magic number FF D8 FF)
+                if (b.length < 3
+                        || (b[0] != (byte) 0xFF) // cast to signed byte (-1)
+                        || (b[1] != (byte) 0xD8) //cast to signed byte (-40)
+                        || (b[2] != (byte) 0xFF)) {
+                    throw new Parser.Failure("File has no jpeg signature", location);
+                }
            } catch (final IOException e) {
                ConcurrentLog.logException(e);
                throw new Parser.Failure(e.getMessage(), location);
@ -226,7 +228,7 @@ public class genericImageParser extends AbstractParser implements Parser {
        return SUPPORTED_EXTENSIONS;
    }

-    public static ImageInfo parseJavaImage(
+    private ImageInfo parseJavaImage(
                            final AnchorURL location,
                            final InputStream sourceStream) throws Parser.Failure {
        BufferedImage image = null;
@ -241,7 +243,7 @@ public class genericImageParser extends AbstractParser implements Parser {
        return parseJavaImage(location, image);
    }

-    public static ImageInfo parseJavaImage(
+    private ImageInfo parseJavaImage(
                            final AnchorURL location,
                            final BufferedImage image) {
        final ImageInfo ii = new ImageInfo(location);
@ -278,7 +280,7 @@ public class genericImageParser extends AbstractParser implements Parser {
        return ii;
    }

-    public static class ImageInfo {
+    private class ImageInfo {
        public AnchorURL location;
        public BufferedImage image;
        public StringBuilder info;
--- a/source/net/yacy/search/schema/CollectionConfiguration.java
+++ b/source/net/yacy/search/schema/CollectionConfiguration.java
@ -917,14 +917,22 @@ public class CollectionConfiguration extends SchemaConfiguration implements Seri
        if (document.getContentDomain() == ContentDomain.IMAGE) {
            // add image pixel size if known
            Iterator<ImageEntry> imgit = document.getImages().values().iterator();
-            if (imgit.hasNext()) {
+            List<Integer> heights = new ArrayList<>();
+            List<Integer> widths = new ArrayList<>();
+            List<Integer> pixels = new ArrayList<>();
+            while (imgit.hasNext()) {
                ImageEntry img = imgit.next();
                int imgpixels = (img.height() < 0 || img.width() < 0) ? -1 : img.height() * img.width();
-                if (imgpixels > 0) {
-                    if (allAttr || contains(CollectionSchema.images_height_val)) add(doc, CollectionSchema.images_height_val, img.height());
-                    if (allAttr || contains(CollectionSchema.images_width_val)) add(doc, CollectionSchema.images_width_val, img.width());
-                    if (allAttr || contains(CollectionSchema.images_pixel_val)) add(doc, CollectionSchema.images_pixel_val, imgpixels);
+                if (imgpixels > 0 && (allAttr || (contains(CollectionSchema.images_height_val) && contains(CollectionSchema.images_width_val) && contains(CollectionSchema.images_pixel_val)))) {
+                    heights.add(img.height());
+                    widths.add(img.width());
+                    pixels.add(imgpixels);
+                }
            }
+            if (heights.size() > 0) {
+                add(doc, CollectionSchema.images_height_val, heights);
+                add(doc, CollectionSchema.images_width_val, widths);
+                add(doc, CollectionSchema.images_pixel_val, pixels);
            }

            if (allAttr || contains(CollectionSchema.images_text_t))  {