/**
* Evaluation
* Copyright 2011 by Michael Peter Christen
* First released 28.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-03-08 02:51:51 +0100 (Di, 08 Mrz 2011) $
* $LastChangedRevision: 7567 $
* $LastChangedBy: low012 $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.html;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.util.MemoryControl;
/*
* This class provides methods to use a pattern analysis for html files
* The pattern analysis is generic and can be configured using a field-name/pattern property
* configuration file.
* Such a configuration file has property names of the structure
* <subject-name>_<document-element>
* and the values are Java regular expressions.
* An html file is scanned for pattern matches within a specific <document-element>,
* and if such a match is found then the <subject-name> is collected as
* a subject for the scanned document.
* patternProperties files must have special file names where the file name
* starts with the word "parser." and must end with ".properties"
* everything between this is a name for a solr multi-value field where
* the collected subject names are stored to
*/
public class Evaluation {
/** The list of all models that shall be applied; typed with Model so readers of the list need no casts. */
private static List<Model> models = new ArrayList<Model>();
/**
 * Names of the document elements that can be referenced by this class.
 * The constants are deliberately lower-case.
 * NOTE(review): presumably they are matched by name against the element part
 * of a configuration property key - confirm against the parsing code before renaming.
 * The declaration order is kept stable so that ordinals do not change.
 */
public enum Element {
    // textual content
    text,
    title,
    // markup attributes and resource paths
    bodyclass,
    divid,
    csspath,
    metagenerator,
    url,
    // scripts
    scriptpath,
    scriptcode,
    // embedded and linked resources
    framepath,
    iframepath,
    imgpath,
    apath,
    // comments
    comment
}
/**
 * Couples a subject name with the regular expression that must match for that subject.
 */
private static class Attribute {

    /** the name of the attribute */
    public String subject;

    /** the pattern that must match for that attribute */
    public Pattern pattern;

    /**
     * @param subject the name of the attribute
     * @param pattern the pattern that must match for that attribute
     */
    public Attribute(final String subject, final Pattern pattern) {
        this.pattern = pattern;
        this.subject = subject;
    }

    /**
     * @return the subject, a colon, and the string form of the pattern
     */
    @Override
    public String toString() {
        final StringBuilder rendered = new StringBuilder();
        rendered.append(this.subject);
        rendered.append(':');
        // dereference the pattern explicitly so that a missing pattern still fails here
        rendered.append(this.pattern.toString());
        return rendered.toString();
    }
}
private static class Model {
private final String modelName;
private final Map> elementMatcher; // a mapping from element-names to lists of Attributes
public Model(final File patternProperties) throws IOException {
if (!patternProperties.exists()) throw new IOException("File does not exist: " + patternProperties);
final String name = patternProperties.getName();
if (!name.startsWith("parser.")) throw new IOException("file name must start with 'parser.': " + name);
if (!name.endsWith(".properties")) throw new IOException("file name must end with '.properties': " + name);
this.modelName = name.substring(7, name.length() - 11);
if (this.modelName.length() < 1) throw new IOException("file name too short: " + name);
// load the file
final Properties p = new Properties();
p.load(new FileReader(patternProperties));
// iterate through the properties and generate method patterns
this.elementMatcher = new HashMap>();
String subject, elementName;
Element element;
Pattern pattern;
for (final Map.Entry