@@ -40,6 +40,7 @@ import java.util.Set;
 import net.yacy.cora.document.encoding.UTF8;
 import net.yacy.cora.document.id.AnchorURL;
 import net.yacy.cora.document.id.DigestURL;
+import net.yacy.cora.document.id.MultiProtocolURL;
 import net.yacy.cora.protocol.ClientIdentification;
 import net.yacy.cora.util.CommonPattern;
 import net.yacy.document.AbstractParser;
@@ -60,7 +61,7 @@ public class htmlParser extends AbstractParser implements Parser {
     private static final int maxLinks = 10000;
 
     public final static String[] htmlExtensions = new String[]{
-            "htm", "html", "phtml", "shtml", "shtm", "stm", "xhtml", "phtml", "phtm",
+            "htm", "html", "shtml", "shtm", "stm", "xhtml", "phtml", "phtm",
             "tpl", "php", "php2", "php3", "php4", "php5", "cfm", "asp", "aspx", "tex", "txt", "msg"
     };
 
@@ -99,11 +100,26 @@ public class htmlParser extends AbstractParser implements Parser {
             final ContentScraper scraper = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, sourceStream, maxLinks);
             // parseToScraper also detects/corrects/sets charset from html content tag
             final Document document = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraper);
+            Document documentSnapshot = null;
+            try {
+                // check for the ajax crawling scheme (https://developers.google.com/webmasters/ajax-crawling/docs/specification)
+                // and create a sub-document for the snapshot page (which will be merged by the loader)
+                // TODO: as a crawl request removes the anchor part from the original url, getRef() never succeeds - consider other handling, such as removeRef() in the crawler
+                if (location.getRef() != null && location.getRef().startsWith("!")) {
+                    documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset);
+                } else { // the head tag fragment is only allowed on urls without an anchor hashfragment, but there are discussions that an existing hashfragment anchor takes preference (i.e. allow both)
+                    if (scraper.getMetas().containsKey("fragment") && scraper.getMetas().get("fragment").equals("!")) {
+                        documentSnapshot = parseAlternativeSnapshot(location, mimeType, documentCharset, vocscraper, timezoneOffset);
+                    }
+                }
+            } catch (Exception ex1) { // ignore any exception on any issue with the snapshot
+                documentSnapshot = null;
+            }
-            return new Document[]{document};
+            return documentSnapshot == null ? new Document[]{document} : new Document[]{document, documentSnapshot};
         } catch (final IOException e) {
             throw new Parser.Failure("IOException in htmlParser: " + e.getMessage(), location);
         }
     }
 
     /**
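
For reference, the url rewriting that parseAlternativeSnapshot performs in the next hunk with DigestURL and MultiProtocolURL follows the escaped-fragment mapping of the specification linked above: a "#!" hash fragment becomes an `_escaped_fragment_` query parameter. A minimal standalone sketch of that mapping — the EscapedFragment helper is hypothetical and not part of this patch, and java.net.URLEncoder's form encoding (spaces as "+") only approximates the percent-escaping the spec prescribes:

    final class EscapedFragment {
        // Map an ajax url to the url of its html snapshot, following
        // https://developers.google.com/webmasters/ajax-crawling/docs/specification
        static String toSnapshotUrl(final String url) throws java.io.UnsupportedEncodingException {
            final int hashBang = url.indexOf("#!");
            if (hashBang < 0) {
                // case (2): the page opted in via <meta name="fragment" content="!" />
                return url + (url.contains("?") ? "&" : "?") + "_escaped_fragment_=";
            }
            // case (1): move the escaped "#!" fragment value into a query parameter
            final String base = url.substring(0, hashBang);
            final String fragment = url.substring(hashBang + 2);
            return base + (base.contains("?") ? "&" : "?")
                    + "_escaped_fragment_=" + java.net.URLEncoder.encode(fragment, "UTF-8");
        }
    }

    // EscapedFragment.toSnapshotUrl("http://example.com/app#!page=1")
    //   -> "http://example.com/app?_escaped_fragment_=page%3D1"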
@@ -322,6 +338,46 @@ public class htmlParser extends AbstractParser implements Parser {
         return encoding;
     }
 
+    /**
+     * Implementation of the ajax crawling scheme to crawl the content of the html snapshot page
+     * instead of the (empty) original ajax url, see
+     * https://developers.google.com/webmasters/ajax-crawling/docs/specification
+     * The ajax crawling scheme is denoted by a url whose anchor part starts with "!" (1)
+     * or by a header tag <meta name="fragment" content="!" /> (2).
+     *
+     * It is expected that the check for the ajax crawling scheme has already happened, so we can
+     * directly try to get the snapshot page.
+     *
+     * @param location original url (ajax url)
+     * @param mimeType
+     * @param documentCharset
+     * @param vocscraper
+     * @param timezoneOffset
+     * @return document resulting from the parsed snapshot, or null if it does not exist or on any other issue with the snapshot
+     */
+    private Document parseAlternativeSnapshot(final DigestURL location, final String mimeType, final String documentCharset,
+            final VocabularyScraper vocscraper, final int timezoneOffset) {
+        Document documentSnapshot = null;
+        try {
+            // construct the url for case (1) with anchor
+            final DigestURL locationSnapshot;
+            if (location.getRef() != null && !location.getRef().isEmpty() && location.getRef().startsWith("!")) {
+                if (location.getSearchpart().isEmpty()) {
+                    // according to the spec the hashfragment is to be escaped
+                    locationSnapshot = new DigestURL(location.toNormalform(true) + "?_escaped_fragment_=" + MultiProtocolURL.escape(location.getRef().substring(1)));
+                } else {
+                    locationSnapshot = new DigestURL(location.toNormalform(true) + "&_escaped_fragment_=" + MultiProtocolURL.escape(location.getRef().substring(1)).toString());
+                }
+            } else { // construct the url for case (2) - no anchor, but header tag fragment="!"
+                locationSnapshot = new DigestURL(location.toNormalform(true) + "?_escaped_fragment_=");
+            }
+            Charset[] detectedcharsetcontainer = new Charset[]{null};
+            ContentScraper scraperSnapshot = parseToScraper(location, documentCharset, vocscraper, detectedcharsetcontainer, timezoneOffset, locationSnapshot.getInputStream(ClientIdentification.yacyInternetCrawlerAgent, null, null), maxLinks);
+            documentSnapshot = transformScraper(location, mimeType, detectedcharsetcontainer[0].name(), scraperSnapshot);
+        } catch (IOException | Failure ex) { } // snapshot not available or not parseable - return null
+        return documentSnapshot;
+    }
+
     public static void main(final String[] args) {
         // test parsing of a url
         AnchorURL url;
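
To exercise the new code path end to end, a driver along the following lines could work. This is a rough sketch, not part of the patch: the parse(...) parameter order is inferred from the calls visible above, the no-argument VocabularyScraper constructor is assumed, the demo class is assumed to live in htmlParser's package (net.yacy.document.parser), and example.com / sample.html are placeholders; fetching the snapshot also requires network access:

    import java.io.FileInputStream;
    import java.io.InputStream;
    import net.yacy.cora.document.id.AnchorURL;
    import net.yacy.document.Document;
    import net.yacy.document.VocabularyScraper;

    public class AjaxSchemeDemo {
        public static void main(final String[] args) throws Exception {
            // a url with a "#!" anchor triggers case (1) of the ajax crawling scheme
            final AnchorURL url = new AnchorURL("http://example.com/app#!page=1");
            try (final InputStream source = new FileInputStream("sample.html")) {
                final Document[] docs = new htmlParser().parse(
                        url, "text/html", null, new VocabularyScraper(), 0, source);
                // docs[0] is the original document; docs[1], if present, is the snapshot
                System.out.println("documents returned: " + docs.length);
            }
        }
    }

Note that, per the TODO in the parse hunk above, urls arriving through the crawler have their anchor part removed, so in production only the meta fragment case (2) is likely to trigger; case (1) fires on directly constructed urls like the one in this sketch.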