Added a meta-refresh parser to htmlFilterContentScraper

* getRefreshSeconds() - number of seconds until the refresh fires
* getRefreshPath() - target URL of the refresh
See also: http://www.yacy-forum.de/viewtopic.php?p=16851#16851

git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@1657 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
orbiter 19 years ago
parent ba5fe0b287
commit 56516fd8e6

@ -214,7 +214,18 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt","")); if (tagname.equalsIgnoreCase("img")) images.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("alt",""));
if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {} if (tagname.equalsIgnoreCase("base")) try {root = new URL(tagopts.getProperty("href", ""));} catch (MalformedURLException e) {}
if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name","")); if (tagname.equalsIgnoreCase("frame")) anchors.put(absolutePath(tagopts.getProperty("src", "")), tagopts.getProperty("name",""));
if (tagname.equalsIgnoreCase("meta")) metas.put((tagopts.getProperty("name", "")).toLowerCase(), tagopts.getProperty("content","")); if (tagname.equalsIgnoreCase("meta")) {
String name = tagopts.getProperty("name", "");
if (name.length() > 0) {
metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
return;
}
name = tagopts.getProperty("http-equiv", "");
if (name.length() > 0) {
metas.put(name.toLowerCase(), tagopts.getProperty("content",""));
return;
}
}
} }
public void scrapeTag1(String tagname, Properties tagopts, byte[] text) { public void scrapeTag1(String tagname, Properties tagopts, byte[] text) {
@ -330,6 +341,28 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
} }
} }
/**
 * Returns the delay, in seconds, stored in the "refresh" entry of the scraped metas.
 *
 * The entry is expected to have the form "seconds;rest". Whitespace around the
 * number is ignored, so a value such as "5 ; url=..." is parsed as 5 instead of
 * being rejected by Integer.parseInt.
 *
 * @return the parsed number of seconds, or 9999 when there is no "refresh" entry,
 *         when the entry contains no ';', or when the text before the ';' is not
 *         a valid integer
 */
public int getRefreshSeconds() {
    String s = (String) metas.get("refresh");
    if (s == null) return 9999;
    int pos = s.indexOf(';');
    if (pos < 0) return 9999;
    try {
        // Integer.parseInt does not accept leading/trailing blanks, so strip them first
        return Integer.parseInt(s.substring(0, pos).trim());
    } catch (NumberFormatException e) {
        // not a number: report the same value as "no refresh entry"
        return 9999;
    }
}
/**
 * Returns the target given in the "refresh" entry of the scraped metas.
 *
 * The entry is expected to have the form "seconds;url=target". The "url="
 * prefix is matched case-insensitively, and whitespace between the ';' and the
 * prefix is ignored so that the common spelling "0; url=..." is recognised.
 *
 * @return the trimmed text following "url=", or the empty string when there is no
 *         "refresh" entry, when the entry contains no ';', or when the text after
 *         the ';' does not start with "url="
 */
public String getRefreshPath() {
    String s = (String) metas.get("refresh");
    if (s == null) return "";
    int pos = s.indexOf(';');
    if (pos < 0) return "";
    // trim before testing the prefix: a blank usually follows the ';'
    String target = s.substring(pos + 1).trim();
    // case-insensitive prefix test without building a lower-cased copy
    if (target.regionMatches(true, 0, "url=", 0, 4)) return target.substring(4).trim();
    return "";
}
/* /*
* (non-Javadoc) * (non-Javadoc)
* @see de.anomic.htmlFilter.htmlFilterScraper#close() * @see de.anomic.htmlFilter.htmlFilterScraper#close()

Loading…
Cancel
Save