yacy_search_server/source/net/yacy/document/parser/apkParser.java

/**
 *  apkParser
 *  Copyright 2014 by Michael Peter Christen
 *  First released 09.06.2014 at http://yacy.net
 *
 *  This library is free software; you can redistribute it and/or
 *  modify it under the terms of the GNU Lesser General Public
 *  License as published by the Free Software Foundation; either
 *  version 2.1 of the License, or (at your option) any later version.
 *
 *  This library is distributed in the hope that it will be useful,
 *  but WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  Lesser General Public License for more details.
 *
 *  You should have received a copy of the GNU Lesser General Public License
 *  along with this program in the file lgpl21.txt
 *  If not, see <http://www.gnu.org/licenses/>.
 */

package net.yacy.document.parser;

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Date;
import java.util.Enumeration;
import java.util.HashSet;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.jar.JarEntry;
import java.util.jar.JarFile;

import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.document.AbstractParser;
import net.yacy.document.Document;
import net.yacy.document.Parser;
import net.yacy.document.VocabularyScraper;

public class apkParser extends AbstractParser implements Parser  {

    /**
     * Creates the parser under the name "Android Application Parser" and
     * adds the "apk" extension and the Android package MIME type to the
     * supported-extension and supported-MIME-type collections.
     */
    public apkParser() {
        super("Android Application Parser");
        SUPPORTED_EXTENSIONS.add("apk");
        SUPPORTED_MIME_TYPES.add("application/vnd.android.package-archive");
    }

    /**
     * Parses an apk that is delivered as a stream.
     * <p>
     * The stream is first copied into a temporary file because the archive is
     * read through {@link JarFile}, which works on files. The temporary file is
     * then handed to {@link #parse(DigestURL, String, String, JarFile)}.
     * The output stream, the jar file and the temporary file are released on
     * every exit path, including failures.
     *
     * @param location the url of the apk
     * @param mimeType the mime type passed on to the resulting document
     * @param charset the charset passed on to the resulting document
     * @param scraper not used by this parser
     * @param timezoneOffset not used by this parser
     * @param source the raw apk content; it is read until end of stream but not closed here
     * @return the parsed documents, or {@code null} if an {@link IOException}
     *         occurred while copying or opening the archive (the exception is logged)
     */
    @Override
    public Document[] parse(
            final DigestURL location,
            final String mimeType,
            final String charset,
            final VocabularyScraper scraper,
            final int timezoneOffset,
            final InputStream source) throws Parser.Failure, InterruptedException {

        /*
         * things to discover:
         * - name
         * - version
         * - signature hash
         * - class root (to identify same apps for different versions)
         * - author (name of signer)
         * - strings from resources
         */
        File tempFile = null;
        try {
            tempFile = File.createTempFile("apk" + System.currentTimeMillis(), ".jar");

            // copy the stream to disk; try-with-resources closes the file even if the copy fails
            try (FileOutputStream out = new FileOutputStream(tempFile)) {
                final byte[] buffer = new byte[8192];
                int read;
                while ((read = source.read(buffer, 0, buffer.length)) != -1) {
                    out.write(buffer, 0, read);
                }
            }

            // the jar must be closed before the temp file is deleted, otherwise
            // the file handle leaks (and deletion fails on some platforms)
            try (JarFile jf = new JarFile(tempFile)) {
                return parse(location, mimeType, charset, jf);
            }
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
            return null;
        } finally {
            if (tempFile != null && !tempFile.delete()) {
                // could not be removed right now, let the JVM try again at exit
                tempFile.deleteOnExit();
            }
        }
    }

    /**
     * Builds the document for an already opened apk archive.
     * <p>
     * The indexed text is assembled from (in this order) the title and the
     * permissions found in {@code AndroidManifest.xml}, the names of all archive
     * entries, and the strings recovered from {@code resources.arsc}. The first
     * http/https/ftp url found in each resource string is collected as a link.
     * Every part is best-effort: a missing or unreadable entry is skipped (read
     * failures are logged) and the remaining parts are still evaluated.
     *
     * @param location the url of the apk; its file name is the start of the title
     * @param mimeType the mime type passed on to the resulting document
     * @param charset the charset passed on to the resulting document
     * @param jf the opened archive; it is not closed by this method
     * @return an array with exactly one document
     */
    public Document[] parse(final DigestURL location, final String mimeType, final String charset, final JarFile jf) {
        final StringBuilder sb = new StringBuilder();
        String title = location.getFileName();
        AndroidManifestParser manifest = null;
        try {
            final byte[] xml = readEntryBytes(jf, "AndroidManifest.xml");
            if (xml != null) {
                manifest = new AndroidManifestParser(xml, true);
                // only append what was actually found, to avoid a literal "null" in the title
                if (manifest.packageName != null) title = title + " " + manifest.packageName;
                if (manifest.versionName != null) title = title + " " + manifest.versionName;
                sb.append(title).append(". ");
                for (final String permission : manifest.permissions) sb.append(permission).append(". ");
            }
        } catch (final IOException | RuntimeException e) {
            // a damaged manifest must not prevent indexing of the rest of the archive
            ConcurrentLog.logException(e);
        }

        final Enumeration<JarEntry> entries = jf.entries();
        while (entries.hasMoreElements()) {
            sb.append(entries.nextElement().getName()).append(". ");
        }

        final Collection<AnchorURL> links = new ArrayList<>();
        try {
            final byte[] arsc = readEntryBytes(jf, "resources.arsc");
            if (arsc != null) {
                // hand over an in-memory stream so that the complete content is visible to the parser
                final List<String> resources = resourcesArscParser(new ByteArrayInputStream(arsc));
                for (final String s : resources) {
                    sb.append(s).append(". ");
                    int p = s.indexOf("http://");
                    if (p < 0) p = s.indexOf("https://");
                    if (p < 0) p = s.indexOf("ftp://");
                    if (p >= 0) {
                        // the url ends at the next space or at the end of the string
                        final int q = s.indexOf(' ', p + 1);
                        final String link = q < 0 ? s.substring(p) : s.substring(p, q);
                        try {
                            links.add(new AnchorURL(link));
                        } catch (final MalformedURLException ignored) {
                            // resource strings are arbitrary text; something that merely
                            // looks like a url is silently skipped
                        }
                    }
                }
            }
        } catch (final IOException e) {
            ConcurrentLog.logException(e);
        }

        return new Document[]{new Document(
                location,
                mimeType,
                charset,
                this,
                null,
                null,
                singleList(title),
                null,
                manifest == null ? "" : manifest.packageName,
                null,
                null,
                0.0d, 0.0d,
                sb.toString(),
                links,
                null,
                null,
                false,
                new Date())};
    }

    /**
     * Reads the complete content of one archive entry.
     *
     * @param jf the opened archive
     * @param entryName the name of the entry inside the archive
     * @return all bytes of the entry, or {@code null} if the archive has no such entry
     * @throws IOException if the entry cannot be read
     */
    private static byte[] readEntryBytes(final JarFile jf, final String entryName) throws IOException {
        final JarEntry entry = jf.getJarEntry(entryName);
        if (entry == null) return null;
        try (InputStream is = jf.getInputStream(entry)) {
            if (is == null) return null;
            // a single read() call may deliver only a part of the entry, therefore loop until end of stream
            final ByteArrayOutputStream content = new ByteArrayOutputStream();
            final byte[] chunk = new byte[4096];
            int n;
            while ((n = is.read(chunk)) != -1) {
                content.write(chunk, 0, n);
            }
            return content.toByteArray();
        }
    }

    /**
     * A simplified Android binary XML parser which reads parts of the xml
     * (as used for AndroidManifest.xml inside apk files) into metadata fields.
     * <p>
     * The parser is tolerant: truncated or otherwise malformed input ends the
     * evaluation at the point where the data stops making sense; everything
     * collected up to that point is kept in the public fields.
     * <p>
     * code taken from
     * http://stackoverflow.com/questions/2097813/how-to-parse-the-androidmanifest-xml-file-inside-an-apk-package
     * original author: http://stackoverflow.com/users/539612/ribo
     * The author has taken the code snippet from his own application published
     * as "PackageExlorer", see https://play.google.com/store/apps/details?id=org.andr.pkgexp
     * <p>
     * The code was adapted to fill metadata fields by [MC]
     * <p>
     * documentation about binary xml can be found at:
     * http://justanapplication.wordpress.com/category/android/android-binary-xml/
     * <p>
     * consider to replace this with one of
     * https://github.com/xiaxiaocao/apk-parser
     * http://code.google.com/p/axml/
     * https://github.com/joakime/android-apk-parser
     */
    public static class AndroidManifestParser {

        /** first word of the chunk which ends the document */
        private static final int END_DOC_TAG = 0x00100101;
        /** first word of an element start chunk */
        private static final int START_TAG = 0x00100102;
        /** first word of an element end chunk */
        private static final int END_TAG = 0x00100103;

        /** offset of the StringIndexTable; this is also the minimum size of a parseable file */
        private static final int STRING_INDEX_TABLE_OFFSET = 0x24;
        /** size of a start tag without attributes: 6 + 3 words */
        private static final int START_TAG_SIZE = 9 * 4;
        /** size of an end tag: 6 words */
        private static final int END_TAG_SIZE = 6 * 4;
        /** size of one attribute: 5 words */
        private static final int ATTRIBUTE_SIZE = 5 * 4;

        // Kept because it is part of the constructor contract. The trace output
        // that it used to guard was already disabled (commented out), therefore
        // the flag currently has no observable effect.
        private final boolean debug;

        /** value of the versionCode attribute of the manifest element, null if not found */
        public String versionCode = null;
        /** value of the versionName attribute of the manifest element, null if not found */
        public String versionName = null;
        /** value of the package attribute of the manifest element, null if not found */
        public String packageName = null;
        /** value of the minSdkVersion attribute of the uses-sdk element, null if not found */
        public String minSdkVersion = null;
        /** value of the targetSdkVersion attribute of the uses-sdk element, null if not found */
        public String targetSdkVersion = null;
        /** name attributes of all uses-permission elements */
        public Set<String> permissions = new HashSet<>();
        /** name attributes of all action elements */
        public Set<String> actions = new HashSet<>();
        /** name attributes of all category elements */
        public Set<String> categories = new HashSet<>();

        /**
         * Parses the given binary xml immediately; the results are available
         * in the public fields afterwards.
         * @param xml the content of a binary AndroidManifest.xml
         * @param debug retained for compatibility, see the comment at the field
         */
        public AndroidManifestParser(final byte[] xml, final boolean debug) {
            this.debug = debug;
            decompressXML(xml);
        }

        /**
         * Parse the 'compressed' binary form of Android XML docs such as for AndroidManifest.xml in .apk files.
         * Parsing stops silently as soon as a chunk would reach beyond the end of the data.
         * @param xml the binary xml; null or data shorter than the header is ignored
         */
        private void decompressXML(byte[] xml) {
            // Compressed XML file/bytes starts with 24x bytes of data,
            // 9 32 bit words in little endian order (LSB first):
            // 0th word is 03 00 08 00
            // 3rd word SEEMS TO BE: Offset at the end of StringTable
            // 4th word is: Number of strings in string table
            // WARNING: Sometimes words are displayed or referred to in
            // little endian storage format, or in integer format (ie MSB first).
            if (xml == null || xml.length < STRING_INDEX_TABLE_OFFSET) return;
            final int numbStrings = LEW(xml, 4 * 4);

            // StringIndexTable starts at offset 24x, an array of 32 bit LE offsets
            // of the length/string data in the StringTable.
            final int sitOff = STRING_INDEX_TABLE_OFFSET;

            // StringTable, each string is represented with a 16 bit little endian
            // character count, followed by that number of 16 bit (LE) (Unicode)
            // chars.
            // NOTE(review): string pools which store UTF-8 are not handled here - confirm if needed
            final int stOff = sitOff + numbStrings * 4; // StringTable follows StrIndexTable

            // XMLTags, The XML tag tree starts after some unknown content after the
            // StringTable. There is some unknown data after the StringTable, scan
            // forward from this point to the flag for the start of an XML start
            // tag.
            int xmlTagOff = LEW(xml, 3 * 4); // Start from the offset in the 3rd word.
            if (xmlTagOff < 0 || xmlTagOff >= xml.length) return; // header points outside of the data
            // Scan forward until we find the bytes: 0x02011000(x00100102 in normal int)
            for (int ii = xmlTagOff; ii < xml.length - 4; ii += 4) {
                if (LEW(xml, ii) == START_TAG) {
                    xmlTagOff = ii;
                    break;
                }
            } // end of hack, scanning for start of first start tag

            // XML tags and attributes:
            // Every XML start and end tag consists of 6 32 bit words:
            // 0th word: 02011000 for startTag and 03011000 for endTag
            // 1st word: a flag?, like 38000000
            // 2nd word: Line of where this tag appeared in the original source file
            // 3rd word: FFFFFFFF ??
            // 4th word: StringIndex of NameSpace name, or FFFFFFFF for default NS
            // 5th word: StringIndex of Element Name
            // (Note: 01011000 in 0th word means end of XML document, endDocTag)

            // Start tags (not end tags) contain 3 more words:
            // 6th word: 14001400 meaning??
            // 7th word: low 16 bit: number of attributes that follow this tag;
            //           high 16 bit: not part of the count (idIndex in the
            //           platform struct ResXMLTree_attrExt) and must be masked out
            // 8th word: 00000000 meaning??

            // Attributes consist of 5 words:
            // 0th word: StringIndex of Attribute Name's Namespace, or FFFFFFFF
            // 1st word: StringIndex of Attribute Name
            // 2nd word: StringIndex of Attribute Value, or FFFFFFFF if ResourceId
            // used
            // 3rd word: Flags?
            // 4th word: str ind of attr value again, or ResourceId of value

            // Step through the XML tree element tags and attributes.
            // Invariant: 0 <= off <= xml.length, therefore (xml.length - off) cannot overflow.
            int off = xmlTagOff;
            int indent = 0;
            while (xml.length - off >= 4) {
                final int tag0 = LEW(xml, off);

                if (tag0 == START_TAG) { // XML START TAG
                    if (xml.length - off < START_TAG_SIZE) break; // truncated start tag
                    final int nameSi = LEW(xml, off + 5 * 4);
                    final int numbAttrs = LEW(xml, off + 7 * 4) & 0xFFFF; // Number of Attributes to follow
                    off += START_TAG_SIZE; // Skip over 6+3 words of startTag data
                    final String name = compXmlString(xml, sitOff, stOff, nameSi);

                    // Look for the Attributes
                    final Map<String, String> attributes = new LinkedHashMap<>();
                    boolean truncated = false;
                    for (int ii = 0; ii < numbAttrs; ii++) {
                        if (xml.length - off < ATTRIBUTE_SIZE) {
                            truncated = true;
                            break;
                        }
                        final int attrNameSi = LEW(xml, off + 1 * 4); // AttrName String Index
                        final int attrValueSi = LEW(xml, off + 2 * 4); // AttrValue Str Ind, or FFFFFFFF
                        final int attrResId = LEW(xml, off + 4 * 4); // AttrValue ResourceId or dup AttrValue StrInd
                        off += ATTRIBUTE_SIZE; // Skip over the 5 words of an attribute

                        final String attrName = compXmlString(xml, sitOff, stOff, attrNameSi);
                        final String attrValue = attrValueSi != -1
                                ? compXmlString(xml, sitOff, stOff, attrValueSi)
                                : "resourceID 0x" + Integer.toHexString(attrResId);
                        attributes.put(attrName, attrValue);
                    }
                    // attributes which were read completely are still evaluated if the data ends here
                    evaluateTag(indent, name, attributes);
                    if (truncated) break;
                    indent++;
                } else if (tag0 == END_TAG) { // XML END TAG
                    if (xml.length - off < END_TAG_SIZE) break; // truncated end tag
                    final int nameSi = LEW(xml, off + 5 * 4);
                    indent--;
                    off += END_TAG_SIZE; // Skip over 6 words of endTag data
                    final String name = compXmlString(xml, sitOff, stOff, nameSi);
                    evaluateTag(indent, name, null);
                } else if (tag0 == END_DOC_TAG) { // END OF XML DOC TAG
                    break;
                } else {
                    // unrecognized tag code
                    break;
                }
            }
        }

        /**
         * Return the string with the given index from the string table.
         * @param xml the binary xml
         * @param sitOff offset of the StringIndexTable
         * @param stOff offset of the StringTable
         * @param strInd index of the string
         * @return the string, or null if the index is negative or if the
         *         index entry or the string itself lies outside of the data
         */
        public String compXmlString(byte[] xml, int sitOff, int stOff, int strInd) {
            if (strInd < 0) return null;
            // long arithmetic: indexes come from untrusted data and must not wrap around
            final long indexPos = (long) sitOff + (long) strInd * 4L;
            if (indexPos < 0 || indexPos + 4 > xml.length) return null;
            final long strOff = (long) stOff + LEW(xml, (int) indexPos);
            if (strOff < 0 || strOff + 2 > xml.length) return null;
            return compXmlStringAt(xml, (int) strOff);
        }

        /**
         * Evaluate one tag and store the values of interest in the metadata fields.
         * Only start tags carry information; calls for end tags are ignored.
         * @param indent nesting depth of the tag; not used for the evaluation
         * @param tagName the element name, may be null if it could not be resolved
         * @param attributes the attributes of a start tag, or null for an end tag
         */
        public void evaluateTag(int indent, String tagName, Map<String, String> attributes) {
            if (attributes == null || tagName == null) return;
            switch (tagName) {
                case "manifest":
                    this.versionCode = attributes.get("versionCode");
                    this.versionName = attributes.get("versionName");
                    this.packageName = attributes.get("package");
                    break;
                case "uses-sdk":
                    this.minSdkVersion = attributes.get("minSdkVersion");
                    this.targetSdkVersion = attributes.get("targetSdkVersion");
                    break;
                case "uses-permission":
                    addIfPresent(this.permissions, attributes.get("name"));
                    break;
                case "action":
                    addIfPresent(this.actions, attributes.get("name"));
                    break;
                case "category":
                    addIfPresent(this.categories, attributes.get("name"));
                    break;
                default:
                    break;
            }
        }

        /** Add the value to the set unless it is null. */
        private static void addIfPresent(final Set<String> target, final String value) {
            if (value != null) target.add(value);
        }

        /**
         * Return the string stored in StringTable format at offset strOff.
         * This offset points to the 16 bit string length, which
         * is followed by that number of 16 bit (Unicode) chars.
         * If the data ends before the announced length is reached, only the
         * characters which are completely available are returned.
         * @param arr the binary xml
         * @param strOff offset of the 16 bit length
         * @return the string, or null if the length field lies outside of arr
         */
        public String compXmlStringAt(byte[] arr, int strOff) {
            if (strOff < 0 || arr.length - strOff < 2) return null;
            final int strLen = ((arr[strOff + 1] & 0xff) << 8) | (arr[strOff] & 0xff);
            // this limit should never apply if the compressed xml is well-formed, but some are not(!)
            final int availableChars = (arr.length - strOff - 2) / 2;
            final int count = Math.min(strLen, availableChars);
            final char[] chars = new char[count];
            for (int ii = 0; ii < count; ii++) {
                final int p0 = strOff + 2 + ii * 2;
                chars[ii] = (char) (((arr[p0 + 1] & 0x00FF) << 8) + (arr[p0] & 0x00FF));
            }
            return new String(chars);
        }

        /**
         * Return value of a Little Endian 32 bit word from the byte array at offset off.
         * @param arr the byte array
         * @param off offset of the least significant byte
         * @return the 32 bit value
         * @throws ArrayIndexOutOfBoundsException if the four bytes are not inside of arr
         */
        public int LEW(byte[] arr, int off) {
            return ((arr[off + 3] & 0xFF) << 24)
                 | ((arr[off + 2] & 0xFF) << 16)
                 | ((arr[off + 1] & 0xFF) << 8)
                 |  (arr[off] & 0xFF);
        } // end of LEW

    }

    /**
     * this arsc parser is far away from being correct, it's just a hack:
     * it scans the content for byte sequences of the form
     * {@code 0, charcount, bytecount, <bytecount bytes without a 0>, 0}
     * and accepts a sequence as string if its UTF-8 decoding has exactly
     * charcount characters.
     * <p>
     * The stream is read completely until its end. Count bytes with the high
     * bit set are not treated as valid one-byte counts; such candidates are skipped.
     *
     * @param arscStream a stream from the arsc content; it is not closed here
     * @return a list of resource strings
     * @throws IOException if reading from the stream fails
     */
    public static List<String> resourcesArscParser(InputStream arscStream) throws IOException {
        // available() is only an estimate and a single read() may return less than
        // requested, therefore collect the content until the end of the stream
        final ByteArrayOutputStream collected = new ByteArrayOutputStream();
        final byte[] chunk = new byte[4096];
        int n;
        while ((n = arscStream.read(chunk)) != -1) {
            collected.write(chunk, 0, n);
        }
        final byte[] asa = collected.toByteArray();

        int pos = 0;
        final Charset charset = StandardCharsets.UTF_8;
        final List<String> s = new ArrayList<>();
        parseloop: while (pos < asa.length) {
            // move to the next 0 byte, it precedes every candidate
            while (pos < asa.length && asa[pos] != 0) pos++;
            if (pos + 2 >= asa.length) break parseloop;
            // the next two bytes are counters:
            // the first counts the number of characters
            // the second counts the number of bytes (which may be greater)
            // Java bytes are signed: a value <= 0 is either an empty count or a
            // byte with the high bit set, both cannot start a string here
            final int charcount = asa[++pos];
            if (charcount <= 0) continue parseloop;
            final int bytecount = asa[++pos];
            if (bytecount <= 0) continue parseloop;
            pos++;
            if (bytecount < charcount) continue parseloop;
            if (pos + bytecount + 1 > asa.length) break parseloop;
            if (asa[pos + bytecount] != 0) {pos++; continue parseloop;} // must be terminated by 0
            for (int i = pos; i < pos + bytecount; i++) if (asa[i] == 0) {pos++; continue parseloop;} // must not contain a 0
            final String t = new String(asa, pos, bytecount, charset);
            if (t.length() == charcount) s.add(t);
            pos += bytecount;
        }
        return s;
    }

    /**
     * Command line test: prints the entry list of an apk, the metadata found
     * in its AndroidManifest.xml and the strings recovered from resources.arsc.
     * Missing entries are reported instead of causing an exception.
     *
     * @param args args[0] is the path to the apk file
     */
    public static void main(String[] args) {
        if (args == null || args.length < 1) {
            System.err.println("usage: apkParser <path to apk file>");
            System.exit(1);
            return;
        }
        System.out.println("apk parser test with file " + args[0]);
        System.out.println();
        System.out.println("File list:");
        try (JarFile jf = new JarFile(args[0])) {
            final Enumeration<JarEntry> e = jf.entries();
            while (e.hasMoreElements()) {
                System.out.println(e.nextElement().getName());
            }

            System.out.println();
            System.out.println("AndroidManifest.xml:");
            final byte[] xml = slurpEntry(jf, "AndroidManifest.xml");
            if (xml == null) {
                System.out.println("(entry not found)");
            } else {
                final AndroidManifestParser manifest = new AndroidManifestParser(xml, true);
                System.out.println("package: " + manifest.packageName);
                System.out.println("versionCode: " + manifest.versionCode);
                System.out.println("versionName: " + manifest.versionName);
                System.out.println("minSdkVersion: " + manifest.minSdkVersion);
                System.out.println("targetSdkVersion: " + manifest.targetSdkVersion);
                System.out.println("permissions: " + manifest.permissions);
                System.out.println("actions: " + manifest.actions);
                System.out.println("categories: " + manifest.categories);
            }

            System.out.println();
            System.out.println("resources.arsc:");
            final byte[] arsc = slurpEntry(jf, "resources.arsc");
            if (arsc == null) {
                System.out.println("(entry not found)");
            } else {
                // an in-memory stream makes the complete content visible to the parser
                final List<String> resources = resourcesArscParser(new ByteArrayInputStream(arsc));
                for (final String s : resources) {
                    System.out.println(s);
                }
            }
        } catch (final Exception e) {
            e.printStackTrace();
        }
        // NOTE(review): the exit status is 1 even after a successful run; this is
        // kept unchanged here because callers may rely on it - confirm and fix if not
        System.exit(1);
    }

    /**
     * Reads the complete content of one archive entry for the command line test.
     *
     * @param jf the opened archive
     * @param entryName the name of the entry inside the archive
     * @return all bytes of the entry, or {@code null} if the archive has no such entry
     * @throws IOException if the entry cannot be read
     */
    private static byte[] slurpEntry(final JarFile jf, final String entryName) throws IOException {
        final JarEntry entry = jf.getJarEntry(entryName);
        if (entry == null) return null;
        try (InputStream is = jf.getInputStream(entry)) {
            if (is == null) return null;
            // a single read() call may deliver only a part of the entry
            final ByteArrayOutputStream content = new ByteArrayOutputStream();
            final byte[] chunk = new byte[4096];
            int n;
            while ((n = is.read(chunk)) != -1) {
                content.write(chunk, 0, n);
            }
            return content.toByteArray();
        }
    }

}