// YMarkDMOZImporter.java
// (C) 2012 by Stefan Foerster (apfelmaennchen), sof@gmx.de, Norderstedt, Germany
// first published 2012 on http://yacy.net
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// $LastChangedDate$
// $LastChangedRevision$
// $LastChangedBy$
//
// LICENSE
// 
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA

package net.yacy.data.ymark;

import net.yacy.cora.lod.vocabulary.DMOZ;
import net.yacy.cora.lod.vocabulary.DublinCore;

import org.xml.sax.Attributes;
import org.xml.sax.InputSource;
import org.xml.sax.SAXException;
import org.xml.sax.XMLReader;
import org.xml.sax.helpers.DefaultHandler;
import org.xml.sax.helpers.XMLReaderFactory;

/**
 * Bookmark importer for DMOZ XML data.
 *
 * <p>The input is parsed with a SAX {@link XMLReader}. For every
 * {@code ExternalPage} element a {@link YMarkEntry} is built from the
 * element's first attribute (stored as URL) and its title, description
 * and topic child elements. An entry is put into the {@code bookmarks}
 * queue only if its topic path starts with {@code sourceFolder}; the topic
 * path is stored below {@code targetFolder} and cut off at the configured
 * depth.
 */
public class YMarkDMOZImporter extends YMarkImporter {
    // Statics
    public static String IMPORTER = "DMOZ";

    // Importer Variables
    private final XMLReader xmlReader;
    /** Maximum number of '/' separators kept in a folder path (see {@link #setDepth(int)}). */
    private int depth;

    /**
     * Creates the importer and configures a non-validating, non-namespace-aware SAX reader.
     *
     * @param dmoz_file    reader delivering the DMOZ XML data
     * @param queueSize    passed through to the superclass constructor
     * @param targetFolder folder below which imported topics are stored
     * @param sourceFolder topic prefix an entry must have to be imported
     * @throws SAXException if the XML reader cannot be created or configured
     */
    public YMarkDMOZImporter(final MonitoredReader dmoz_file, final int queueSize, final String targetFolder, final String sourceFolder) throws SAXException {
        // NOTE(review): this constructor receives (targetFolder, sourceFolder) but hands them to
        // super as (sourceFolder, targetFolder) - confirm this matches the superclass parameter order.
        super(dmoz_file, queueSize, sourceFolder, targetFolder);
        setImporter(IMPORTER);
        this.xmlReader = XMLReaderFactory.createXMLReader();
        this.xmlReader.setFeature(XML_NAMESPACE_PREFIXES, false);
        this.xmlReader.setFeature(XML_NAMESPACES, false);
        this.xmlReader.setFeature(XML_VALIDATION, false);
        this.xmlReader.setContentHandler(new DMOZParser());
        // no depth limit until setDepth() is called
        this.depth = Integer.MAX_VALUE;
    }

    /**
     * Parses the complete input; the content handler queues the bookmarks it finds.
     *
     * @throws Exception any failure reported by the XML reader, including a
     *         {@link SAXException} raised when the importing thread is interrupted
     */
    public void parse() throws Exception {
        xmlReader.parse(new InputSource(bmk_file));
    }

    /**
     * Limits the depth of imported folder paths.
     *
     * <p>The effective limit is {@code d} plus the number of
     * separator-delimited parts of {@code targetFolder}, minus one.
     *
     * @param d number of folder levels to keep in addition to the target folder
     */
    public void setDepth(int d) {
        this.depth = d + this.targetFolder.split(YMarkUtil.FOLDERS_SEPARATOR).length - 1;
    }

    /**
     * SAX content handler that assembles one {@link YMarkEntry} per
     * {@code ExternalPage} element.
     */
    public class DMOZParser extends DefaultHandler {

        /** Entry currently being assembled. */
        private YMarkEntry bmk;
        /** True while inside an ExternalPage element that has not been rejected. */
        private boolean isNewEntry;
        /** True once the current entry's topic matched the source folder. */
        private boolean isSubtopic;
        /** Bookmark key the buffered character data belongs to, or null if the text is ignored. */
        private String tag;
        /** Collects character data (and the folder prefix for topics) until the element ends. */
        private final StringBuilder buffer;

        public DMOZParser() {
            this.bmk = new YMarkEntry();
            this.isNewEntry = false;
            this.isSubtopic = false;
            this.buffer = new StringBuilder(512);
        }

        /**
         * Starts a new entry on {@code ExternalPage} and selects the bookmark key
         * for title, description and topic elements of a pending entry.
         */
        @Override
        public void startElement(final String uri, String localName, final String qName, final Attributes attributes) throws SAXException {
            // get rid of namespace prefixes
            if (localName.isEmpty()) {
                localName = qName.substring(qName.indexOf(':') + 1);
            }
            this.tag = null;
            if (localName.equals(DMOZ.ExternalPage.name())) {
                this.bmk = new YMarkEntry();
                // the URL is taken from the first attribute of the element
                this.bmk.put(YMarkEntry.BOOKMARK.URL.key(), attributes.getValue(0));
                this.isNewEntry = true;
            }
            if (this.isNewEntry && localName.equals(DublinCore.Title.name())) {
                this.tag = YMarkEntry.BOOKMARK.TITLE.key();
            }
            if (this.isNewEntry && localName.equals(DublinCore.Description.name())) {
                this.tag = YMarkEntry.BOOKMARK.DESC.key();
            }
            if (this.isNewEntry && localName.equals(DMOZ.topic.name())) {
                this.tag = YMarkEntry.BOOKMARK.FOLDERS.key();
                // the topic text is appended behind "<targetFolder><separator>"
                this.buffer.append(targetFolder);
                this.buffer.append(YMarkUtil.FOLDERS_SEPARATOR);
            }
        }

        /**
         * Stores the buffered text under the current key, evaluates a finished
         * topic, or queues the entry when its {@code ExternalPage} element ends.
         *
         * @throws SAXException if the thread is interrupted while queueing an entry;
         *         the interrupt flag is restored before the exception is thrown
         */
        @Override
        public void endElement(final String uri, String localName, final String qName) throws SAXException {
            // get rid of namespace prefixes
            if (localName.isEmpty()) {
                localName = qName.substring(qName.indexOf(':') + 1);
            }
            if (this.isNewEntry && this.isSubtopic && localName.equals(DMOZ.ExternalPage.name())) {
                try {
                    bookmarks.put(this.bmk);
                } catch (final InterruptedException e) {
                    // An interrupt is a request to stop: keep the flag set and abort the
                    // parse instead of silently continuing with the rest of the input.
                    Thread.currentThread().interrupt();
                    throw new SAXException("DMOZ import interrupted while queueing a bookmark", e);
                } finally {
                    this.isSubtopic = false;
                    this.isNewEntry = false;
                }
            } else if (localName.equals(DMOZ.topic.name())) {
                closeTopic();
            } else if (this.tag != null) {
                this.bmk.put(this.tag, this.buffer.toString());
            }
            this.tag = null;
            this.buffer.setLength(0);
        }

        /**
         * Buffers character data for the element selected in {@code startElement}.
         */
        @Override
        public void characters(final char ch[], final int start, final int length) throws SAXException {
            // no processing here, as the SAX Parser characters method could be called more than once per tag!
            if (this.tag != null) {
                this.buffer.append(ch, start, length);
            }
        }

        /** Decides whether the finished topic belongs to the source folder and stores the folder path. */
        private void closeTopic() {
            if (this.tag == null) {
                // No topic text was collected for a pending entry, so there is
                // no folder to evaluate; drop the entry instead of failing.
                rejectEntry();
                return;
            }
            truncateToDepth();
            if (relativeTopicPath().startsWith(sourceFolder)) {
                this.isSubtopic = true;
                this.bmk.put(this.tag, YMarkUtil.cleanFoldersString(this.buffer));
            } else {
                rejectEntry();
            }
        }

        /** Cuts the buffered folder path in front of the first '/' that exceeds the depth limit. */
        private void truncateToDepth() {
            int separators = 0;
            for (int i = 0; i < this.buffer.length(); i++) {
                if (this.buffer.charAt(i) == '/' && ++separators > depth) {
                    this.buffer.setLength(i);
                    return;
                }
            }
        }

        /**
         * Returns the buffered path without the leading target folder and separator,
         * or an empty string if truncation left nothing behind that prefix.
         */
        private String relativeTopicPath() {
            final int prefixLength = targetFolder.length() + 1;
            return this.buffer.length() > prefixLength ? this.buffer.substring(prefixLength) : "";
        }

        /** Marks the current entry as not to be imported. */
        private void rejectEntry() {
            this.isSubtopic = false;
            this.isNewEntry = false;
        }
    }
}