// Source: yacy_search_server/source/net/yacy/document/parser/html/Evaluation.java

/**
* Evaluation
* Copyright 2011 by Michael Peter Christen
* First released 28.04.2011 at http://yacy.net
*
* $LastChangedDate: 2011-03-08 02:51:51 +0100 (Di, 08 Mrz 2011) $
* $LastChangedRevision: 7567 $
* $LastChangedBy: low012 $
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.document.parser.html;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import net.yacy.cora.sorting.ClusteredScoreMap;
import net.yacy.cora.util.ConcurrentLog;
import net.yacy.kelondro.util.MemoryControl;
/*
* This class provides methods to use a pattern analysis for html files
* The pattern analysis is generic and can be configured using a field-name/pattern property
* configuration file.
* Such a configuration file has names of the structure
* <subject-name>_<document-element>
* and values are regular java expressions
* A html file is scanned for pattern matchings within a specific <document-element>
* and if such a matching can be found then the <attribute-name> is collected as
* subject for the scanned document
* patternProperties files must have special file names where the file name
* starts with the word "parser." and must end with ".properties"
* everything between this is a name for a solr multi-value field where
* the collected subject names are stored to
*/
public class Evaluation {
private static List<Model> models = new ArrayList<Model>(); // the list of all models that shall be applied
public static enum Element {
text,
title,
bodyclass,
divid,
csspath,
metagenerator,
url,
scriptpath,
scriptcode,
framepath,
iframepath,
imgpath,
apath,
comment;
}
private static class Attribute {
public String subject; // the name of the attribute
public Pattern pattern; // the pattern that must match for that attribute
public Attribute(final String subject, final Pattern pattern) {
this.subject = subject;
this.pattern = pattern;
}
@Override
public String toString() {
return this.subject + ":" + this.pattern.toString();
}
}
private static class Model {
private final String modelName;
private final Map<Element, List<Attribute>> elementMatcher; // a mapping from element-names to lists of Attributes
public Model(final File patternProperties) throws IOException {
if (!patternProperties.exists()) throw new IOException("File does not exist: " + patternProperties);
final String name = patternProperties.getName();
if (!name.startsWith("parser.")) throw new IOException("file name must start with 'parser.': " + name);
if (!name.endsWith(".properties")) throw new IOException("file name must end with '.properties': " + name);
this.modelName = name.substring(7, name.length() - 11);
if (this.modelName.length() < 1) throw new IOException("file name too short: " + name);
// load the file
final Properties p = new Properties();
p.load(new FileReader(patternProperties));
// iterate through the properties and generate method patterns
this.elementMatcher = new HashMap<Element, List<Attribute>>();
String subject, elementName;
Element element;
Pattern pattern;
for (final Map.Entry<Object, Object> entry: p.entrySet()) {
final String k = (String) entry.getKey();
final String v = (String) entry.getValue();
final int w = k.indexOf('_');
if (w < 0) {
ConcurrentLog.severe("PatternAnalysis", "wrong configuration in " + name + ": separator '_' missing: " + k);
continue;
}
subject = k.substring(0, w);
elementName = k.substring(w + 1);
try {
pattern = Pattern.compile(v);
} catch (final PatternSyntaxException e) {
ConcurrentLog.severe("PatternAnalysis", "bad pattern in " + name + ": '" + k + "=" + v + "' - " + e.getDescription());
continue;
}
element = Element.valueOf(elementName);
if (element == null) {
ConcurrentLog.severe("PatternAnalysis", "unknown element in " + name + ": " + elementName);
continue;
}
List<Attribute> attributeList = this.elementMatcher.get(element);
if (attributeList == null) {
attributeList = new ArrayList<Attribute>();
this.elementMatcher.put(element, attributeList);
}
attributeList.add(new Attribute(subject, pattern));
}
}
public String getName() {
return this.modelName;
}
/**
* match elementContents for a specific elementName
* @param element - the name of the element as Element enum type
* @param content - the content of the element
* @return a list of subject names that match with the element
*/
public ClusteredScoreMap<String> match(final Element element, final CharSequence content) {
added a new way of content browsing in search results: - date navigation The date is taken from the CONTENT of the documents / web pages, NOT from a date submitted in the context of metadata (i.e. http header or html head form). This makes it possible to search for documents in the future, i.e. when documents contain event descriptions for future events. The date is written to an index field which is now enabled by default. All documents are scanned for contained date mentions. To visualize the dates for a specific search results, a histogram showing the number of documents for each day is displayed. To render these histograms the morris.js library is used. Morris.js requires also raphael.js which is now also integrated in YaCy. The histogram is now also displayed in the index browser by default. To select a specific range from a search result, the following modifiers had been introduced: from:<date> to:<date> These modifiers can be used separately (i.e. only 'from' or only 'to') to describe an open interval or combined to have a closed interval. Both dates are inclusive. To select a specific single date only, use the 'to:' - modifier. The histogram shows blue and green lines; the green lines denot weekend days (saturday and sunday). Clicking on bars in the histogram has the following reaction: 1st click: add a from:<date> modifier for the date of the bar 2nd click: add a to:<date> modifier for the date of the bar 3rd click: remove from and date modifier and set a on:<date> for the bar When the on:<date> modifier is used, the histogram shows an unlimited time period. This makes it possible to click again (4th click) which is then interpreted as a 1st click again (sets a from modifier). The display feature is NOT switched on by default; to switch it on use the /ConfigSearchPage_p.html servlet.
10 years ago
final ClusteredScoreMap<String> subjects = new ClusteredScoreMap<String>(false);
final List<Attribute> patterns = this.elementMatcher.get(element);
if (patterns == null) return subjects;
for (final Attribute attribute: patterns) {
if (attribute.pattern.matcher(content).matches()) {
subjects.inc(attribute.subject);
}
}
return subjects;
}
@Override
public String toString() {
return this.modelName + ":" + this.elementMatcher.toString();
}
}
private final Map<String, ClusteredScoreMap<String>> modelMap; // a map from model names to attribute scores
public Evaluation() {
this.modelMap = new HashMap<String, ClusteredScoreMap<String>>();
}
@Override
public String toString() {
return this.modelMap.toString();
}
/**
* produce all model names
* @return a set of model names
*/
public Set<String> getModelNames() {
return this.modelMap.keySet();
}
/**
* calculate the scores for a model
* the scores is a attribute/count map which count how often a specific attribute was found
* @param modelName
* @return
*/
public ClusteredScoreMap<String> getScores(final String modelName) {
return this.modelMap.get(modelName);
}
/**
* add a model to the evaluation set
* @param f
* @throws IOException
*/
public static void add(final File f) throws IOException {
final Model pattern = new Model(f);
models.add(pattern);
}
/**
* match some content within a specific element
* this will increase statistic counters for models if a model matches
* @param element - the element where a matching is made
* @param content - the content of the element which shall be matched
*/
public void match(final Element element, final CharSequence content) {
if (models.isEmpty()) return; // fast return if this feature is not used
ClusteredScoreMap<String> newScores, oldScores;
for (final Model pattern: models) {
newScores = pattern.match(element, content);
oldScores = getScores(pattern.getName());
if (oldScores == null) {
added a new way of content browsing in search results: - date navigation The date is taken from the CONTENT of the documents / web pages, NOT from a date submitted in the context of metadata (i.e. http header or html head form). This makes it possible to search for documents in the future, i.e. when documents contain event descriptions for future events. The date is written to an index field which is now enabled by default. All documents are scanned for contained date mentions. To visualize the dates for a specific search results, a histogram showing the number of documents for each day is displayed. To render these histograms the morris.js library is used. Morris.js requires also raphael.js which is now also integrated in YaCy. The histogram is now also displayed in the index browser by default. To select a specific range from a search result, the following modifiers had been introduced: from:<date> to:<date> These modifiers can be used separately (i.e. only 'from' or only 'to') to describe an open interval or combined to have a closed interval. Both dates are inclusive. To select a specific single date only, use the 'to:' - modifier. The histogram shows blue and green lines; the green lines denot weekend days (saturday and sunday). Clicking on bars in the histogram has the following reaction: 1st click: add a from:<date> modifier for the date of the bar 2nd click: add a to:<date> modifier for the date of the bar 3rd click: remove from and date modifier and set a on:<date> for the bar When the on:<date> modifier is used, the histogram shows an unlimited time period. This makes it possible to click again (4th click) which is then interpreted as a 1st click again (sets a from modifier). The display feature is NOT switched on by default; to switch it on use the /ConfigSearchPage_p.html servlet.
10 years ago
oldScores = new ClusteredScoreMap<String>(false);
this.modelMap.put(pattern.getName(), oldScores);
}
oldScores.inc(newScores);
}
}
public void match(final Element element, final char[] content) {
if (models.isEmpty()) return; // fast return if this feature is not used
if (MemoryControl.request(content.length * 2, false)) {
match(element, new String(content) /*Segment(content, 0, content.length)*/);
}
}
public static void main(String[] args) {
String t =
"// [CDATA[\n" +
"var gaJsH xost = ((\"https:\" == document.location.protocol) ? \"https://ssl.\" : \"http://www.\");\n" +
"document.write(unescape(\"%3Cscript xsrc='\" + gaJ sHost + \"google-analytics.com/ga.js'\n" +
"type='text/javascript'%3E%3C/script%3E\"));\"\n" +
"\"// ]]\"";
Pattern p = Pattern.compile("(?s).*gaJsHost.*|(?s).*_gat._anonymizeIp.*");
if (p.matcher(t).matches()) System.out.println("1");
}
}