enhanced location search:

Search is now done using verify=false (instead of verify=cacheonly), which allows many more targets to be found.
This exposed a bug: when verify=false was requested, the location information from the document metadata (and other metadata fields) was not used at all. The bug was fixed.
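To illustrate the new convention, here is a minimal, self-contained sketch (the class and enum below are illustrative stand-ins, not YaCy's actual CrawlProfile or SRURSSConnector types): a null cache strategy is translated into verify=false in the outgoing search request, while a concrete strategy is still sent under its own name, exactly as in the SRURSSConnector hunk further down.

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class VerifyParamSketch {

        // illustrative stand-in for CrawlProfile.CacheStrategy
        enum CacheStrategy {
            NOCACHE, IFFRESH, IFEXIST, CACHEONLY;
            String toName() { return name().toLowerCase(); }
        }

        // mirrors the SRURSSConnector change below: a null strategy becomes verify=false
        static Map<String, String> searchParams(String query, CacheStrategy cacheStrategy) {
            Map<String, String> parts = new LinkedHashMap<String, String>();
            parts.put("query", query);
            parts.put("verify", cacheStrategy == null ? "false" : cacheStrategy.toName());
            return parts;
        }

        public static void main(String[] args) {
            // the location search used to send verify=cacheonly; with null it now sends verify=false
            System.out.println(searchParams("castle", CacheStrategy.CACHEONLY)); // {query=castle, verify=cacheonly}
            System.out.println(searchParams("castle", null));                    // {query=castle, verify=false}
        }
    }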

Also added location parsing from MediaWiki dumps: a Wikipedia dump can now serve as a source for a location search.
Fixed many smaller bugs in connection with the location search.
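The coordinate templates handled by the new WikiCode.processMetadata (see the WikiCode hunks below) encode positions either as decimal degrees (NS=52.205944) or as degree/minute/second with a trailing hemisphere letter (NS=43/50/29/N). A minimal standalone sketch of that conversion to signed decimal degrees (the class and method names are hypothetical, not part of this commit):

    public class CoordinateSketch {

        // convert "52.205944", "43/50/29/N" or "73/23/17/W" to signed decimal degrees
        static float toDecimalDegrees(String value) {
            String[] d = value.split("/");
            int n = d.length;
            float sign = 1.0f;
            String last = d[n - 1].trim().toUpperCase();
            if (last.equals("S") || last.equals("W")) { sign = -1.0f; n--; } // southern/western hemisphere
            else if (last.equals("N") || last.equals("E")) { n--; }         // positive hemisphere marker
            float deg = Float.parseFloat(d[0]);
            if (deg < 0) { sign = -1.0f; deg = -deg; }
            if (n >= 2) deg += Float.parseFloat(d[1]) / 60.0f;   // minutes
            if (n >= 3) deg += Float.parseFloat(d[2]) / 3600.0f; // seconds
            return sign * deg;
        }

        public static void main(String[] args) {
            System.out.println(toDecimalDegrees("52.205944"));  // 52.205944
            System.out.println(toDecimalDegrees("43/50/29/N")); // ~43.84139
            System.out.println(toDecimalDegrees("73/23/17/W")); // ~-73.38806
        }
    }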

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7657 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter · 14 years ago · commit 958ff4778e · parent 8d63f3b70f · pull/1/head

@@ -79,24 +79,23 @@
 </head>
 <body id="yacysearch_location" onload="init();">
-#(display)#
-#%env/templates/simpleheader.template%#
-::
-#%env/templates/header.template%#
-::
+#(topmenu)#
 #%env/templates/embeddedheader.template%#
-#(/display)#
+::
 <div id="api">
 <a href="yacysearch_location.rss" id="apilink"><img src="/env/grafics/api.png" width="60" height="40" alt="API"/></a>
 <script type="text/javascript">
 //<![CDATA[
-document.getElementById('apilink').setAttribute('href', 'yacysearch_location.rss?' + window.location.search.substring(1));
+document.getElementById('apilink').setAttribute('href', 'yacysearch_location.rss?dom=metatag|alltext&' + window.location.search.substring(1));
 //]]>
 </script>
 <span>The information that is presented on this page can also be retrieved as XML
 Click the API icon to see the XML.
 To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API">API wiki page</a>.</span>
 </div>
+#%env/templates/simpleheader.template%#
+#(/topmenu)#
 
 <form class="search small" onsubmit="return false;" class="search small" accept-charset="UTF-8">
 <h2>#[promoteSearchPageGreeting]#</h2>
 <div class="yacylogo"><a href="#[promoteSearchPageGreeting.homepage]#" class="yacylogo"><img src="#[promoteSearchPageGreeting.smallImage]#" alt="yacysearch" /></a></div>

@@ -29,7 +29,6 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.geolocalization.Location;
-import de.anomic.crawler.CrawlProfile;
 import de.anomic.search.Switchboard;
 import de.anomic.search.SwitchboardConstants;
 import de.anomic.server.serverCore;
@@ -94,7 +93,7 @@ public class yacysearch_location {
         // get a queue of search results
         String rssSearchServiceURL = "http://127.0.0.1:" + sb.getConfig("port", "8090") + "/yacysearch.rss";
         BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
-        SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, CrawlProfile.CacheStrategy.NOCACHE, false, null);
+        SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, null, false, null);
 
         // take the results and compute some locations
         RSSMessage message;
@@ -164,10 +163,7 @@ public class yacysearch_location {
         }
 
         if (header.get(HeaderFramework.CONNECTION_PROP_EXT, "").equals("html")) {
-            final boolean authenticated = sb.adminAuthenticated(header) >= 2;
-            int display = (post == null) ? 0 : post.getInt("display", 0);
-            if (!authenticated) display = 2;
-            prop.put("display", display);
+            prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0);
             prop.put("promoteSearchPageGreeting", sb.getConfig(SwitchboardConstants.GREETING, ""));
             prop.put("promoteSearchPageGreeting.homepage", sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, ""));
             prop.put("promoteSearchPageGreeting.smallImage", sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, ""));

@@ -249,6 +249,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         IFFRESH(1),   // use the cache if the cache exists and is fresh using the proxy-fresh rules
         IFEXIST(2),   // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
         CACHEONLY(3); // never go online, use all content from cache. If no cache entry exist, consider content nevertheless as available
+        // the fifth case may be that the CacheStrategy object is assigned NULL. That means that no snippet creation is wanted.
         public int code;
         private CacheStrategy(int code) {
             this.code = code;

@@ -107,6 +107,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
     private static final String WIKI_CLOSE_LINK = "]]";
     private static final String WIKI_OPEN_LINK = "[[";
+    private static final String WIKI_CLOSE_METADATA = "}}";
+    private static final String WIKI_OPEN_METADATA = "{{";
     private static final String WIKI_CLOSE_EXTERNAL_LINK = "]";
     private static final String WIKI_OPEN_EXTERNAL_LINK = "[";
     private static final String WIKI_CLOSE_PRE_ESCAPED = "&lt;/pre&gt;";
@@ -926,6 +928,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
      */
    public String processLineOfWikiCode(String hostport, String line) {
        //If HTML has not been replaced yet (can happen if method gets called in recursion), replace now!
+       line = processMetadata(line);
        if ((!replacedHtmlAlready || preformattedSpanning) && line.indexOf(WIKI_CLOSE_PRE_ESCAPED) < 0) {
            line = CharacterCoding.unicode2html(line, true);
            replacedHtmlAlready = true;
@@ -974,6 +977,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
        line = tagReplace(line, Tags.STRIKE);
 
        line = processUnorderedList(line);
        line = processOrderedList(line);
        line = processDefinitionList(line);
@@ -991,6 +995,58 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
        return line;
    }
 
+   public String processMetadata(String line) {
+       int p, q, s = 0;
+       while ((p = line.indexOf(WIKI_OPEN_METADATA, s)) >= 0 && (q = line.indexOf(WIKI_CLOSE_METADATA, p + 1)) >= 0) {
+           s = q; // continue with the next position
+           String a = line.substring(p + 2, q);
+           if (a.toLowerCase().startsWith("coordinate")) {
+               // parse geographical coordinates as described in
+               // http://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style_%28dates_and_numbers%29#Geographical_coordinates
+               // looks like:
+               // {{Coord|57|18|22.5|N|4|27|32.7|W|display=title}}
+               // however, such information does not appear as defined above but as:
+               // {{coordinate|NS=52.205944|EW=0.117593|region=GB-CAM|type=landmark}}
+               // {{coordinate|NS=43/50/29/N|EW=73/23/17/W|type=landmark|region=US-NY}}
+               // and if passed through this parser:
+               // {{Coordinate |NS 45/37/43.0/N |EW. 07/58/41.0/E |type=landmark |region=IT-BI}} ## means: degree/minute/second
+               // {{Coordinate |NS 51.48994 |EW. 7.33249 |type=landmark |region=DE-NW}}
+               String b[] = a.split("\\|");
+               float lon = 0.0f, lat = 0.0f;
+               float lonm = 0.0f, latm = 0.0f;
+               String lono = "E", lato = "N";
+               String name = "";
+               for (String c: b) {
+                   if (c.toLowerCase().startsWith("name=")) {
+                       name = c.substring(5);
+                   }
+                   if (c.toUpperCase().startsWith("NS=")) {
+                       String d[] = c.substring(3).split("/");
+                       // strip a trailing hemisphere letter so that "43/50/29/N" parses as degree/minute/second
+                       int dl = ("N".equalsIgnoreCase(d[d.length - 1]) || "S".equalsIgnoreCase(d[d.length - 1])) ? d.length - 1 : d.length;
+                       if (dl == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lato = "S"; l = -l;} lat = (float) Math.floor(l); latm = 60.0f * (l - lat);}
+                       else if (dl == 2) {lat = Float.parseFloat(d[0]); latm = Float.parseFloat(d[1]);}
+                       else if (dl == 3) {lat = Float.parseFloat(d[0]); latm = Float.parseFloat(d[1]) + Float.parseFloat(d[2]) / 60.0f;}
+                       if ("S".equalsIgnoreCase(d[d.length - 1])) lato = "S"; // a trailing S marks the southern hemisphere
+                   }
+                   if (c.toUpperCase().startsWith("EW=")) {
+                       String d[] = c.substring(3).split("/");
+                       int dl = ("E".equalsIgnoreCase(d[d.length - 1]) || "W".equalsIgnoreCase(d[d.length - 1])) ? d.length - 1 : d.length;
+                       if (dl == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lono = "W"; l = -l;} lon = (float) Math.floor(l); lonm = 60.0f * (l - lon);}
+                       else if (dl == 2) {lon = Float.parseFloat(d[0]); lonm = Float.parseFloat(d[1]);}
+                       else if (dl == 3) {lon = Float.parseFloat(d[0]); lonm = Float.parseFloat(d[1]) + Float.parseFloat(d[2]) / 60.0f;}
+                       if ("W".equalsIgnoreCase(d[d.length - 1])) lono = "W"; // a trailing W marks the western hemisphere
+                   }
+               }
+               if (lon != 0.0f && lat != 0.0f) {
+                   // replace this with a format that the html parser can understand
+                   line = line.substring(0, p) + (name.length() > 0 ? (" " + name) : "") + " <nobr> " + lato + " " + lat + "\u00B0 " + latm + "'</nobr><nobr>" + lono + " " + lon + "\u00B0 " + lonm + "'</nobr> " + line.substring(q + WIKI_CLOSE_METADATA.length());
+                   s = p;
+                   continue;
+               }
+           }
+       }
+       return line;
+   }
+
    private class TableOfContent {
        private final List<String> toc = new ArrayList<String>(); // needs to be list which ensures order

@@ -385,7 +385,16 @@ public class ResultFetcher {
        final long dbRetrievalTime = System.currentTimeMillis() - startTime;
        if (cacheStrategy == null) {
-           return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet
+           final TextSnippet snippet = new TextSnippet(
+                   null,
+                   metadata,
+                   snippetFetchWordHashes,
+                   null,
+                   ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))),
+                   180,
+                   Integer.MAX_VALUE,
+                   !query.isLocal());
+           return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, 0); // result with a metadata-only snippet
        }
 
        // load snippet

@@ -167,7 +167,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
        String loc;
        boolean noCacheUsage = url.isFile() || url.isSMB();
        boolean objectWasInCache = (noCacheUsage) ? false : de.anomic.http.client.Cache.has(url);
-       boolean useMetadata = !objectWasInCache && !cacheStrategy.mustBeOffline();
+       boolean useMetadata = !objectWasInCache && (cacheStrategy == null || !cacheStrategy.mustBeOffline());
        if (useMetadata && containsAllHashes(loc = comp.dc_title(), queryhashes)) {
            // try to create the snippet from information given in the url itself
            init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
@@ -186,10 +186,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
            return;
        } else {
            // try to load the resource from the cache
-           response = loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
+           response = loader == null ? null : loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
            if (response == null) {
                // in case that we did not get any result we can still return a success when we are not allowed to go online
-               if (cacheStrategy.mustBeOffline()) {
+               if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
                    init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
                    return;
                }

@@ -199,7 +199,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
        parts.put("query", UTF8.StringBody(query));
        parts.put("startRecord", UTF8.StringBody(Integer.toString(startRecord)));
        parts.put("maximumRecords", UTF8.StringBody(Long.toString(maximumRecords)));
-       parts.put("verify", UTF8.StringBody(cacheStrategy.toName()));
+       parts.put("verify", cacheStrategy == null ? UTF8.StringBody("false") : UTF8.StringBody(cacheStrategy.toName()));
        parts.put("resource", UTF8.StringBody(global ? "global" : "local"));
        parts.put("nav", UTF8.StringBody("none"));
        result = HTTPConnector.getConnector(userAgent == null ? MultiProtocolURI.yacybotUserAgent : userAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);

@@ -608,6 +608,7 @@ dc_rights
        String language = this.dc_language();
        if (language != null && language.length() > 0) os.write("<dc:language>" + this.dc_language() + "</dc:language>\n");
        os.write("<dc:date>" + ISO8601Formatter.FORMATTER.format(date) + "</dc:date>\n");
+       if (this.lon != 0.0f && this.lat != 0.0f) os.write("<geo:long>" + this.lon + "</geo:long><geo:lat>" + this.lat + "</geo:lat>\n");
        os.write("</record>\n");
    }

@@ -504,7 +504,8 @@ public class MediawikiImporter extends Thread implements Importer {
        public void genDocument() throws Parser.Failure {
            try {
                url = new DigestURI(urlStub + title);
-               document = Document.mergeDocuments(url, "text/html", TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html)));
+               Document[] parsed = TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html));
+               document = Document.mergeDocuments(url, "text/html", parsed);
                // the wiki parser is not able to find the proper title in the source text, so it must be set here
                document.setTitle(title);
            } catch (MalformedURLException e1) {

@@ -124,19 +124,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    public void scrapeText(final char[] newtext, final String insideTag) {
        // System.out.println("SCRAPE: " + UTF8.String(newtext));
-       int p, q, s = 0;
+       int p, pl, q, s = 0;
 
        // try to find location information in text
+       // Opencaching:
+       // <nobr>N 50o 05.453&#039;</nobr><nobr>E 008o 30.191&#039;</nobr>
+       // N 52o 28.025 E 013o 20.299
        location: while (s < newtext.length) {
+           pl = 1;
            p = CharBuffer.indexOf(newtext, s, degree);
+           if (p < 0) {p = CharBuffer.indexOf(newtext, s, "&deg;".toCharArray()); if (p >= 0) pl = 5;}
            if (p < 0) break location;
-           // try to find a coordinate
-           // <nobr>N 50o 05.453&#039;</nobr><nobr>E 008o 30.191&#039;</nobr>
-           // N 52o 28.025 E 013o 20.299
-           q = CharBuffer.indexOf(newtext, p, minuteCharsHTML);
-           if (q < 0) q = CharBuffer.indexOf(newtext, p, " E".toCharArray());
-           if (q < 0) q = CharBuffer.indexOf(newtext, p, " W".toCharArray());
-           if (q < 0 && newtext.length - p == 8) q = newtext.length;
+           q = CharBuffer.indexOf(newtext, p + pl, minuteCharsHTML);
+           if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, "'".toCharArray());
+           if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, " E".toCharArray());
+           if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, " W".toCharArray());
+           if (q < 0 && newtext.length - p == 7 + pl) q = newtext.length;
            if (q < 0) break location;
            int r = p;
            while (r-- > 1) {
@@ -144,25 +147,29 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                r--;
                if (newtext[r] == 'N') {
                    this.lat =  Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
-                               Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
+                               Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
+                   if (this.lon != 0.0f) break location;
                    s = q + 6;
                    continue location;
                }
                if (newtext[r] == 'S') {
                    this.lat = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
-                               Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
+                               Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
+                   if (this.lon != 0.0f) break location;
                    s = q + 6;
                    continue location;
                }
                if (newtext[r] == 'E') {
                    this.lon =  Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
-                               Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
+                               Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
+                   if (this.lat != 0.0f) break location;
                    s = q + 6;
                    continue location;
                }
                if (newtext[r] == 'W') {
                    this.lon = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
-                               Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
+                               Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
+                   if (this.lat != 0.0f) break location;
                    s = q + 6;
                    continue location;
                }
