enhanced location search:

Search is now done using verify=false (instead of verify=cacheonly), which allows many more targets to be found.
This exposed a bug: when verify=false was requested, the location information from the document metadata (and other metadata fields) was not used at all. The bug was fixed.
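To illustrate the new convention, here is a minimal, self-contained sketch (the class and enum below are illustrative stand-ins, not YaCy's actual CrawlProfile or SRURSSConnector types): a null cache strategy is translated into verify=false in the outgoing search request, while a concrete strategy is still sent under its own name, exactly as in the SRURSSConnector hunk further down.

    import java.util.LinkedHashMap;
    import java.util.Map;

    public class VerifyParamSketch {

        // illustrative stand-in for CrawlProfile.CacheStrategy
        enum CacheStrategy {
            NOCACHE, IFFRESH, IFEXIST, CACHEONLY;
            String toName() { return name().toLowerCase(); }
        }

        // mirrors the SRURSSConnector change below: a null strategy becomes verify=false
        static Map<String, String> searchParams(String query, CacheStrategy cacheStrategy) {
            Map<String, String> parts = new LinkedHashMap<String, String>();
            parts.put("query", query);
            parts.put("verify", cacheStrategy == null ? "false" : cacheStrategy.toName());
            return parts;
        }

        public static void main(String[] args) {
            // the location search used to send verify=cacheonly; with null it now sends verify=false
            System.out.println(searchParams("castle", CacheStrategy.CACHEONLY)); // {query=castle, verify=cacheonly}
            System.out.println(searchParams("castle", null));                    // {query=castle, verify=false}
        }
    }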

Also added location parsing from MediaWiki dumps: a Wikipedia dump can now serve as a source for a location search.
Fixed many smaller bugs in connection with the location search.
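The coordinate templates handled by the new WikiCode.processMetadata (see the WikiCode hunks below) encode positions either as decimal degrees (NS=52.205944) or as degree/minute/second with a trailing hemisphere letter (NS=43/50/29/N). A minimal standalone sketch of that conversion to signed decimal degrees (the class and method names are hypothetical, not part of this commit):

    public class CoordinateSketch {

        // convert "52.205944", "43/50/29/N" or "73/23/17/W" to signed decimal degrees
        static float toDecimalDegrees(String value) {
            String[] d = value.split("/");
            int n = d.length;
            float sign = 1.0f;
            String last = d[n - 1].trim().toUpperCase();
            if (last.equals("S") || last.equals("W")) { sign = -1.0f; n--; } // southern/western hemisphere
            else if (last.equals("N") || last.equals("E")) { n--; }         // positive hemisphere marker
            float deg = Float.parseFloat(d[0]);
            if (deg < 0) { sign = -1.0f; deg = -deg; }
            if (n >= 2) deg += Float.parseFloat(d[1]) / 60.0f;   // minutes
            if (n >= 3) deg += Float.parseFloat(d[2]) / 3600.0f; // seconds
            return sign * deg;
        }

        public static void main(String[] args) {
            System.out.println(toDecimalDegrees("52.205944"));  // 52.205944
            System.out.println(toDecimalDegrees("43/50/29/N")); // ~43.84139
            System.out.println(toDecimalDegrees("73/23/17/W")); // ~-73.38806
        }
    }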

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@7657 6c8d7289-2bf4-0310-a012-ef5d649a1542
orbiter · 14 years ago · commit 958ff4778e · parent 8d63f3b70f · pull/1/head

@@ -79,24 +79,23 @@
 </head>
 <body id="yacysearch_location" onload="init();">
-#(display)#
-#%env/templates/simpleheader.template%#
-::
-#%env/templates/header.template%#
-::
+#(topmenu)#
 #%env/templates/embeddedheader.template%#
-#(/display)#
+::
 <div id="api">
 <a href="yacysearch_location.rss" id="apilink"><img src="/env/grafics/api.png" width="60" height="40" alt="API"/></a>
 <script type="text/javascript">
 //<![CDATA[
-document.getElementById('apilink').setAttribute('href', 'yacysearch_location.rss?' + window.location.search.substring(1));
+document.getElementById('apilink').setAttribute('href', 'yacysearch_location.rss?dom=metatag|alltext&' + window.location.search.substring(1));
 //]]>
 </script>
 <span>The information that is presented on this page can also be retrieved as XML
 Click the API icon to see the XML.
 To see a list of all APIs, please visit the <a href="http://www.yacy-websuche.de/wiki/index.php/Dev:API">API wiki page</a>.</span>
 </div>
+#%env/templates/simpleheader.template%#
+#(/topmenu)#
 
 <form class="search small" onsubmit="return false;" class="search small" accept-charset="UTF-8">
 <h2>#[promoteSearchPageGreeting]#</h2>
 <div class="yacylogo"><a href="#[promoteSearchPageGreeting.homepage]#" class="yacylogo"><img src="#[promoteSearchPageGreeting.smallImage]#" alt="yacysearch" /></a></div>

@@ -29,7 +29,6 @@ import net.yacy.cora.protocol.RequestHeader;
 import net.yacy.cora.services.federated.opensearch.SRURSSConnector;
 import net.yacy.document.LibraryProvider;
 import net.yacy.document.geolocalization.Location;
-import de.anomic.crawler.CrawlProfile;
 import de.anomic.search.Switchboard;
 import de.anomic.search.SwitchboardConstants;
 import de.anomic.server.serverCore;
@@ -94,7 +93,7 @@ public class yacysearch_location {
         // get a queue of search results
         String rssSearchServiceURL = "http://127.0.0.1:" + sb.getConfig("port", "8090") + "/yacysearch.rss";
         BlockingQueue<RSSMessage> results = new LinkedBlockingQueue<RSSMessage>();
-        SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, CrawlProfile.CacheStrategy.NOCACHE, false, null);
+        SRURSSConnector.searchSRURSS(results, rssSearchServiceURL, query, maximumTime, Integer.MAX_VALUE, null, false, null);
 
         // take the results and compute some locations
         RSSMessage message;
@@ -164,10 +163,7 @@ public class yacysearch_location {
         }
 
         if (header.get(HeaderFramework.CONNECTION_PROP_EXT, "").equals("html")) {
-            final boolean authenticated = sb.adminAuthenticated(header) >= 2;
-            int display = (post == null) ? 0 : post.getInt("display", 0);
-            if (!authenticated) display = 2;
-            prop.put("display", display);
+            prop.put("topmenu", sb.getConfigBool("publicTopmenu", true) ? 1 : 0);
             prop.put("promoteSearchPageGreeting", sb.getConfig(SwitchboardConstants.GREETING, ""));
             prop.put("promoteSearchPageGreeting.homepage", sb.getConfig(SwitchboardConstants.GREETING_HOMEPAGE, ""));
             prop.put("promoteSearchPageGreeting.smallImage", sb.getConfig(SwitchboardConstants.GREETING_SMALL_IMAGE, ""));

@@ -249,6 +249,7 @@ public class CrawlProfile extends ConcurrentHashMap<String, String> implements M
         IFFRESH(1),   // use the cache if the cache exists and is fresh using the proxy-fresh rules
         IFEXIST(2),   // use the cache if the cache exist. Do no check freshness. Otherwise use online source.
         CACHEONLY(3); // never go online, use all content from cache. If no cache entry exist, consider content nevertheless as available
+        // the fifth case may be that the CacheStrategy object is assigned NULL. That means that no snippet creation is wanted.
         public int code;
         private CacheStrategy(int code) {
             this.code = code;

@@ -107,6 +107,8 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
     private static final String WIKI_CLOSE_LINK = "]]";
     private static final String WIKI_OPEN_LINK = "[[";
+    private static final String WIKI_CLOSE_METADATA = "}}";
+    private static final String WIKI_OPEN_METADATA = "{{";
     private static final String WIKI_CLOSE_EXTERNAL_LINK = "]";
     private static final String WIKI_OPEN_EXTERNAL_LINK = "[";
     private static final String WIKI_CLOSE_PRE_ESCAPED = "&lt;/pre&gt;";
@@ -926,6 +928,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
      */
    public String processLineOfWikiCode(String hostport, String line) {
        //If HTML has not been replaced yet (can happen if method gets called in recursion), replace now!
+       line = processMetadata(line);
        if ((!replacedHtmlAlready || preformattedSpanning) && line.indexOf(WIKI_CLOSE_PRE_ESCAPED) < 0) {
            line = CharacterCoding.unicode2html(line, true);
            replacedHtmlAlready = true;
@@ -974,6 +977,7 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
        line = tagReplace(line, Tags.STRIKE);
 
        line = processUnorderedList(line);
        line = processOrderedList(line);
        line = processDefinitionList(line);
@@ -991,6 +995,58 @@ public class WikiCode extends AbstractWikiParser implements WikiParser {
        return line;
    }
 
+   public String processMetadata(String line) {
+       int p, q, s = 0;
+       while ((p = line.indexOf(WIKI_OPEN_METADATA, s)) >= 0 && (q = line.indexOf(WIKI_CLOSE_METADATA, p + 1)) >= 0) {
+           s = q; // continue with the next position
+           String a = line.substring(p + 2, q);
+           if (a.toLowerCase().startsWith("coordinate")) {
+               // parse geographical coordinates as described in
+               // http://en.wikipedia.org/wiki/Wikipedia:Manual_of_Style_%28dates_and_numbers%29#Geographical_coordinates
+               // looks like:
+               // {{Coord|57|18|22.5|N|4|27|32.7|W|display=title}}
+               // however, such information does not appear as defined above but as:
+               // {{coordinate|NS=52.205944|EW=0.117593|region=GB-CAM|type=landmark}}
+               // {{coordinate|NS=43/50/29/N|EW=73/23/17/W|type=landmark|region=US-NY}}
+               // and if passed through this parser:
+               // {{Coordinate |NS 45/37/43.0/N |EW. 07/58/41.0/E |type=landmark |region=IT-BI}} ## means: degree/minute/second
+               // {{Coordinate |NS 51.48994 |EW. 7.33249 |type=landmark |region=DE-NW}}
+               String b[] = a.split("\\|");
+               float lon = 0.0f, lat = 0.0f;
+               float lonm = 0.0f, latm = 0.0f;
+               String lono = "E", lato = "N";
+               String name = "";
+               for (String c: b) {
+                   if (c.toLowerCase().startsWith("name=")) {
+                       name = c.substring(5);
+                   }
+                   if (c.toUpperCase().startsWith("NS=")) {
+                       String d[] = c.substring(3).split("/");
+                       // strip a trailing hemisphere letter so that "43/50/29/N" parses as degree/minute/second
+                       int dl = ("N".equalsIgnoreCase(d[d.length - 1]) || "S".equalsIgnoreCase(d[d.length - 1])) ? d.length - 1 : d.length;
+                       if (dl == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lato = "S"; l = -l;} lat = (float) Math.floor(l); latm = 60.0f * (l - lat);}
+                       else if (dl == 2) {lat = Float.parseFloat(d[0]); latm = Float.parseFloat(d[1]);}
+                       else if (dl == 3) {lat = Float.parseFloat(d[0]); latm = Float.parseFloat(d[1]) + Float.parseFloat(d[2]) / 60.0f;}
+                       if ("S".equalsIgnoreCase(d[d.length - 1])) lato = "S"; // a trailing S marks the southern hemisphere
+                   }
+                   if (c.toUpperCase().startsWith("EW=")) {
+                       String d[] = c.substring(3).split("/");
+                       int dl = ("E".equalsIgnoreCase(d[d.length - 1]) || "W".equalsIgnoreCase(d[d.length - 1])) ? d.length - 1 : d.length;
+                       if (dl == 1) {float l = Float.parseFloat(d[0]); if (l < 0) {lono = "W"; l = -l;} lon = (float) Math.floor(l); lonm = 60.0f * (l - lon);}
+                       else if (dl == 2) {lon = Float.parseFloat(d[0]); lonm = Float.parseFloat(d[1]);}
+                       else if (dl == 3) {lon = Float.parseFloat(d[0]); lonm = Float.parseFloat(d[1]) + Float.parseFloat(d[2]) / 60.0f;}
+                       if ("W".equalsIgnoreCase(d[d.length - 1])) lono = "W"; // a trailing W marks the western hemisphere
+                   }
+               }
+               if (lon != 0.0f && lat != 0.0f) {
+                   // replace this with a format that the html parser can understand
+                   line = line.substring(0, p) + (name.length() > 0 ? (" " + name) : "") + " <nobr> " + lato + " " + lat + "\u00B0 " + latm + "'</nobr><nobr>" + lono + " " + lon + "\u00B0 " + lonm + "'</nobr> " + line.substring(q + WIKI_CLOSE_METADATA.length());
+                   s = p;
+                   continue;
+               }
+           }
+       }
+       return line;
+   }
+
    private class TableOfContent {
        private final List<String> toc = new ArrayList<String>(); // needs to be list which ensures order

@@ -385,7 +385,16 @@ public class ResultFetcher {
        final long dbRetrievalTime = System.currentTimeMillis() - startTime;
        if (cacheStrategy == null) {
-           return new ResultEntry(page, query.getSegment(), peers, null, null, dbRetrievalTime, 0); // result without snippet
+           final TextSnippet snippet = new TextSnippet(
+                   null,
+                   metadata,
+                   snippetFetchWordHashes,
+                   null,
+                   ((query.constraint != null) && (query.constraint.get(Condenser.flag_cat_indexof))),
+                   180,
+                   Integer.MAX_VALUE,
+                   !query.isLocal());
+           return new ResultEntry(page, query.getSegment(), peers, snippet, null, dbRetrievalTime, 0); // result with a metadata-only snippet
        }
 
        // load snippet

@@ -167,7 +167,7 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
        String loc;
        boolean noCacheUsage = url.isFile() || url.isSMB();
        boolean objectWasInCache = (noCacheUsage) ? false : de.anomic.http.client.Cache.has(url);
-       boolean useMetadata = !objectWasInCache && !cacheStrategy.mustBeOffline();
+       boolean useMetadata = !objectWasInCache && (cacheStrategy == null || !cacheStrategy.mustBeOffline());
        if (useMetadata && containsAllHashes(loc = comp.dc_title(), queryhashes)) {
            // try to create the snippet from information given in the url itself
            init(url.hash(), loc, ResultClass.SOURCE_METADATA, null);
@@ -186,10 +186,10 @@ public class TextSnippet implements Comparable<TextSnippet>, Comparator<TextSnip
            return;
        } else {
            // try to load the resource from the cache
-           response = loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
+           response = loader == null ? null : loader.load(loader.request(url, true, reindexing), noCacheUsage ? CrawlProfile.CacheStrategy.NOCACHE : cacheStrategy, Long.MAX_VALUE, true);
            if (response == null) {
                // in case that we did not get any result we can still return a success when we are not allowed to go online
-               if (cacheStrategy.mustBeOffline()) {
+               if (cacheStrategy == null || cacheStrategy.mustBeOffline()) {
                    init(url.hash(), null, ResultClass.ERROR_SOURCE_LOADING, "omitted network load (not allowed), no cache entry");
                    return;
                }

@@ -199,7 +199,7 @@ public class SRURSSConnector extends Thread implements SearchAccumulator {
        parts.put("query", UTF8.StringBody(query));
        parts.put("startRecord", UTF8.StringBody(Integer.toString(startRecord)));
        parts.put("maximumRecords", UTF8.StringBody(Long.toString(maximumRecords)));
-       parts.put("verify", UTF8.StringBody(cacheStrategy.toName()));
+       parts.put("verify", cacheStrategy == null ? UTF8.StringBody("false") : UTF8.StringBody(cacheStrategy.toName()));
        parts.put("resource", UTF8.StringBody(global ? "global" : "local"));
        parts.put("nav", UTF8.StringBody("none"));
        result = HTTPConnector.getConnector(userAgent == null ? MultiProtocolURI.yacybotUserAgent : userAgent).post(new MultiProtocolURI(rssSearchServiceURL), (int) timeout, uri.getHost(), parts);

@@ -608,6 +608,7 @@ dc_rights
        String language = this.dc_language();
        if (language != null && language.length() > 0) os.write("<dc:language>" + this.dc_language() + "</dc:language>\n");
        os.write("<dc:date>" + ISO8601Formatter.FORMATTER.format(date) + "</dc:date>\n");
+       if (this.lon != 0.0f && this.lat != 0.0f) os.write("<geo:long>" + this.lon + "</geo:long><geo:lat>" + this.lat + "</geo:lat>\n");
        os.write("</record>\n");
    }

@@ -504,7 +504,8 @@ public class MediawikiImporter extends Thread implements Importer {
        public void genDocument() throws Parser.Failure {
            try {
                url = new DigestURI(urlStub + title);
-               document = Document.mergeDocuments(url, "text/html", TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html)));
+               Document[] parsed = TextParser.parseSource(url, "text/html", "UTF-8", UTF8.getBytes(html));
+               document = Document.mergeDocuments(url, "text/html", parsed);
                // the wiki parser is not able to find the proper title in the source text, so it must be set here
                document.setTitle(title);
            } catch (MalformedURLException e1) {

@@ -124,19 +124,22 @@ public class ContentScraper extends AbstractScraper implements Scraper {
    public void scrapeText(final char[] newtext, final String insideTag) {
        // System.out.println("SCRAPE: " + UTF8.String(newtext));
-       int p, q, s = 0;
+       int p, pl, q, s = 0;
 
        // try to find location information in text
+       // Opencaching:
+       // <nobr>N 50o 05.453&#039;</nobr><nobr>E 008o 30.191&#039;</nobr>
+       // N 52o 28.025 E 013o 20.299
        location: while (s < newtext.length) {
+           pl = 1;
            p = CharBuffer.indexOf(newtext, s, degree);
+           if (p < 0) {p = CharBuffer.indexOf(newtext, s, "&deg;".toCharArray()); if (p >= 0) pl = 5;}
            if (p < 0) break location;
-           // try to find a coordinate
-           // <nobr>N 50o 05.453&#039;</nobr><nobr>E 008o 30.191&#039;</nobr>
-           // N 52o 28.025 E 013o 20.299
-           q = CharBuffer.indexOf(newtext, p, minuteCharsHTML);
-           if (q < 0) q = CharBuffer.indexOf(newtext, p, " E".toCharArray());
-           if (q < 0) q = CharBuffer.indexOf(newtext, p, " W".toCharArray());
-           if (q < 0 && newtext.length - p == 8) q = newtext.length;
+           q = CharBuffer.indexOf(newtext, p + pl, minuteCharsHTML);
+           if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, "'".toCharArray());
+           if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, " E".toCharArray());
+           if (q < 0) q = CharBuffer.indexOf(newtext, p + pl, " W".toCharArray());
+           if (q < 0 && newtext.length - p == 7 + pl) q = newtext.length;
            if (q < 0) break location;
            int r = p;
            while (r-- > 1) {
@@ -144,25 +147,29 @@ public class ContentScraper extends AbstractScraper implements Scraper {
                r--;
                if (newtext[r] == 'N') {
                    this.lat =  Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
-                               Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
+                               Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
+                   if (this.lon != 0.0f) break location;
                    s = q + 6;
                    continue location;
                }
                if (newtext[r] == 'S') {
                    this.lat = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
-                               Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
+                               Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
+                   if (this.lon != 0.0f) break location;
                    s = q + 6;
                    continue location;
                }
                if (newtext[r] == 'E') {
                    this.lon =  Float.parseFloat(new String(newtext, r + 2, p - r - 2)) +
-                               Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
+                               Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
+                   if (this.lat != 0.0f) break location;
                    s = q + 6;
                    continue location;
                }
                if (newtext[r] == 'W') {
                    this.lon = -Float.parseFloat(new String(newtext, r + 2, p - r - 2)) -
-                               Float.parseFloat(new String(newtext, p + 2, q - p - 2)) / 60.0f;
+                               Float.parseFloat(new String(newtext, p + pl + 1, q - p - pl - 1)) / 60.0f;
+                   if (this.lat != 0.0f) break location;
                    s = q + 6;
                    continue location;
                }
