// plasmaCondenser.java
// -----------------------
// part of YaCy
// (C) by Michael Peter Christen; mc@anomic.de
// first published on http://www.anomic.de
// Frankfurt, Germany, 2004
// last change: 09.01.2004
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
//
// Using this software in any meaning (reading, learning, copying, compiling,
// running) means that you agree that the Author(s) is (are) not responsible
// for cost, loss of data or any harm that may be caused directly or indirectly
// by usage of this software or this documentation. The usage of this software
// is at your own risk. The installation and usage (starting/running) of this
// software may allow other people or applications to access your computer and
// any attached devices and is highly dependent on the configuration of the
// software which must be done by the user of the software; the author(s) is
// (are) also not responsible for proper configuration and usage of the
// software, even if provoked by documentation provided together with
// the software.
//
// Any changes to this file according to the GPL as documented in the file
// gpl.txt aside this file in the shipment you received can be done to the
// lines that follow this copyright notice here, but changes must not be
// done inside the copyright notice above. A re-distribution must contain
// the intact and unchanged copyright notice.
// Contributions and changes to the program code must be marked as such.

// compile with javac -sourcepath source source/de/anomic/plasma/plasmaCondenser.java
// execute with java -cp source de.anomic.plasma.plasmaCondenser

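// Usage sketch: the main() method at the end of this file expects the path to a
// properties file as its first argument (the file name below is only an example):
//   java -cp source de.anomic.plasma.plasmaCondenser keywords.properties
// The file must define the keys keywords0 .. keywords15 as comma-separated word lists.
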
package de.anomic.plasma;

import java.io.BufferedReader;
import java.io.ByteArrayInputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.RandomAccessFile;
import java.io.Reader;
import java.io.UnsupportedEncodingException;
import java.util.Enumeration;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Map;
import java.util.Properties;
import java.util.Set;
import java.util.TreeMap;
import java.util.TreeSet;

import de.anomic.htmlFilter.htmlFilterContentScraper;
import de.anomic.htmlFilter.htmlFilterImageEntry;
import de.anomic.index.indexRWIEntry;
import de.anomic.kelondro.kelondroBase64Order;
import de.anomic.kelondro.kelondroBitfield;
import de.anomic.kelondro.kelondroMSetTools;
import de.anomic.server.serverCodings;
import de.anomic.yacy.yacySeedDB;
import de.anomic.yacy.yacyURL;

public final class plasmaCondenser {

    // this is the page analysis class

    // category flags that show how the page can be distinguished in different interest groups
    public static final int flag_cat_indexof = 0; // a directory listing page (i.e. containing 'index of')
    public static final int flag_cat_opencontent = 1; // open source, any free stuff
    public static final int flag_cat_business = 2; // web shops, marketing, trade
    public static final int flag_cat_stockfinance = 3; // stock exchange (quotes), finance, economy
    public static final int flag_cat_health = 4; // health
    public static final int flag_cat_sport = 5; // any sport, cars etc.
    public static final int flag_cat_lifestyle = 6; // travel, lifestyle
    public static final int flag_cat_politics = 7; // politics
    public static final int flag_cat_news = 8; // blogs, news pages
    public static final int flag_cat_children = 9; // toys, children's education, help for parents
    public static final int flag_cat_entertainment = 10; // boulevard, entertainment, cultural content
    public static final int flag_cat_knowledge = 11; // science, school stuff, help for homework
    public static final int flag_cat_computer = 12; // any computer related stuff, networks, operating systems
    public static final int flag_cat_p2p = 13; // p2p support, file-sharing archives etc.
    public static final int flag_cat_sex = 14; // sexual content
    public static final int flag_cat_spam = 15; // pages that anybody would consider uninteresting
    public static final int flag_cat_linux = 16; // pages about linux software
    public static final int flag_cat_macos = 17; // pages about macintosh, apple computers and the mac os
    public static final int flag_cat_windows = 18; // pages about windows os and software
    public static final int flag_cat_osreserve = 19; // reserve
    public static final int flag_cat_hasimage = 20; // the page refers to (at least one) image
    public static final int flag_cat_hasaudio = 21; // the page refers to (at least one) audio file
    public static final int flag_cat_hasvideo = 22; // the page refers to (at least one) video
    public static final int flag_cat_hasapp = 23; // the page refers to (at least one) application file

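    // Example (sketch): after condensing a document, a category flag can be checked on the
    // result bitfield (this assumes kelondroBitfield exposes a get(int) accessor):
    //   plasmaCondenser condenser = new plasmaCondenser(document, true, true);
    //   boolean hasImages = condenser.RESULT_FLAGS.get(flag_cat_hasimage);
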
    private final static int numlength = 5;

    //private Properties analysis;
    private TreeMap<String, wordStatProp> words; // a string (the words) to (wordStatProp) - relation
    private HashMap<StringBuffer, phraseStatProp> sentences;
    private int wordminsize;
    private int wordcut;

    //public int RESULT_NUMB_TEXT_BYTES = -1;
    public int RESULT_NUMB_WORDS = -1;
    public int RESULT_DIFF_WORDS = -1;
    public int RESULT_NUMB_SENTENCES = -1;
    public int RESULT_DIFF_SENTENCES = -1;
    public kelondroBitfield RESULT_FLAGS = new kelondroBitfield(4);

    public plasmaCondenser(plasmaParserDocument document, boolean indexText, boolean indexMedia) throws UnsupportedEncodingException {
        // if indexMedia == true, then all media links are also parsed and added to the words
        // added media words are flagged with the appropriate media flag
        this.wordminsize = 3;
        this.wordcut = 2;
        this.words = new TreeMap<String, wordStatProp>();
        this.sentences = new HashMap<StringBuffer, phraseStatProp>();
        this.RESULT_FLAGS = new kelondroBitfield(4);

        //System.out.println("DEBUG: condensing " + document.getMainLongTitle() + ", indexText=" + Boolean.toString(indexText) + ", indexMedia=" + Boolean.toString(indexMedia));

        insertTextToWords(document.dc_source().toNormalform(false, true), 0, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS);

        Map.Entry<yacyURL, String> entry;
        if (indexText) {
            createCondensement(document.getText(), document.getCharset());
            // the phrase counter:
            // phrase 0 contains words taken from the URL
            // phrase 1 is the MainTitle
            // phrase 2 is <not used>
            // phrase 3 is the Document Abstract
            // phrase 4 is the Document Author
            // phrase 5 contains the tags specified in the document
            // phrase 10 and above are the section headlines/titles (88 possible)
            // phrase 98 is taken from the embedded anchor/hyperlink descriptions
            // phrase 99 is taken from the media link URLs and anchor descriptions
            // phrase 100 and above are lines from the text

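            // Example (sketch): a word that occurs only in the document title is stored with
            // numOfPhrase == 1 and the indexRWIEntry.flag_app_dc_title bit set in its wordStatProp.flags.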
            insertTextToWords(document.dc_title(), 1, indexRWIEntry.flag_app_dc_title, RESULT_FLAGS);
            insertTextToWords(document.dc_description(), 3, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS);
            insertTextToWords(document.dc_creator(), 4, indexRWIEntry.flag_app_dc_creator, RESULT_FLAGS);
            // missing: tags!
            String[] titles = document.getSectionTitles();
            for (int i = 0; i < titles.length; i++) {
                insertTextToWords(titles[i], i + 10, indexRWIEntry.flag_app_emphasized, RESULT_FLAGS);
            }

            // anchors
            Iterator<Map.Entry<yacyURL, String>> i = document.getAnchors().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
                if ((entry == null) || (entry.getKey() == null)) continue;
                insertTextToWords(entry.getKey().toNormalform(false, false), 98, indexRWIEntry.flag_app_dc_identifier, RESULT_FLAGS);
                insertTextToWords((String) entry.getValue(), 98, indexRWIEntry.flag_app_dc_description, RESULT_FLAGS);
            }
        } else {
            this.RESULT_NUMB_WORDS = 0;
            this.RESULT_DIFF_WORDS = 0;
            this.RESULT_NUMB_SENTENCES = 0;
            this.RESULT_DIFF_SENTENCES = 0;
        }

        if (indexMedia) {
            // audio
            Iterator<Map.Entry<yacyURL, String>> i = document.getAudiolinks().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasaudio, RESULT_FLAGS);
                insertTextToWords((String) entry.getValue(), 99, flag_cat_hasaudio, RESULT_FLAGS);
            }

            // video
            i = document.getVideolinks().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasvideo, RESULT_FLAGS);
                insertTextToWords((String) entry.getValue(), 99, flag_cat_hasvideo, RESULT_FLAGS);
            }

            // applications
            i = document.getApplinks().entrySet().iterator();
            while (i.hasNext()) {
                entry = i.next();
                insertTextToWords(entry.getKey().toNormalform(false, false), 99, flag_cat_hasapp, RESULT_FLAGS);
                insertTextToWords((String) entry.getValue(), 99, flag_cat_hasapp, RESULT_FLAGS);
            }

            // images
            Iterator<htmlFilterImageEntry> j = document.getImages().iterator();
            htmlFilterImageEntry ientry;
            while (j.hasNext()) {
                ientry = j.next();
                insertTextToWords(ientry.url().toNormalform(false, false), 99, flag_cat_hasimage, RESULT_FLAGS);
                insertTextToWords(ientry.alt(), 99, flag_cat_hasimage, RESULT_FLAGS);
            }

            // finally check all words for missing flag entry
            Iterator<Map.Entry<String, wordStatProp>> k = words.entrySet().iterator();
            wordStatProp wprop;
            Map.Entry<String, wordStatProp> we;
            while (k.hasNext()) {
                we = k.next();
                wprop = we.getValue();
                if (wprop.flags == null) {
                    wprop.flags = (kelondroBitfield) RESULT_FLAGS.clone();
                    words.put(we.getKey(), wprop);
                }
            }
        }

        // construct flag set for document
        if (document.getImages().size() > 0) RESULT_FLAGS.set(flag_cat_hasimage, true);
        if (document.getAudiolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasaudio, true);
        if (document.getVideolinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasvideo, true);
        if (document.getApplinks().size() > 0) RESULT_FLAGS.set(flag_cat_hasapp, true);
    }

    private void insertTextToWords(String text, int phrase, int flagpos, kelondroBitfield flagstemplate) {
        String word;
        wordStatProp wprop;
        sievedWordsEnum wordenum;
        try {
            wordenum = new sievedWordsEnum(new ByteArrayInputStream(text.getBytes()), "UTF-8", 3);
        } catch (UnsupportedEncodingException e) {
            return;
        }
        int pip = 0;
        while (wordenum.hasMoreElements()) {
            word = (new String((StringBuffer) wordenum.nextElement())).toLowerCase();
            wprop = (wordStatProp) words.get(word);
            if (wprop == null) wprop = new wordStatProp(0, pip, phrase);
            if (wprop.flags == null) wprop.flags = (kelondroBitfield) flagstemplate.clone();
            wprop.flags.set(flagpos, true);
            words.put(word, wprop);
            pip++;
            this.RESULT_NUMB_WORDS++;
            this.RESULT_DIFF_WORDS++;
        }
    }

    public plasmaCondenser(InputStream text, String charset) throws UnsupportedEncodingException {
        this(text, charset, 3, 2);
    }

    public plasmaCondenser(InputStream text, String charset, int wordminsize, int wordcut) throws UnsupportedEncodingException {
        this.wordminsize = wordminsize;
        this.wordcut = wordcut;
        // analysis = new Properties();
        words = new TreeMap<String, wordStatProp>();
        sentences = new HashMap<StringBuffer, phraseStatProp>();
        createCondensement(text, charset);
    }

    // create a word hash
    public static final String word2hash(String word) {
        return kelondroBase64Order.enhancedCoder.encode(serverCodings.encodeMD5Raw(word.toLowerCase())).substring(0, yacySeedDB.commonHashLength);
    }

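    // Example (sketch): word2hash lower-cases the word, takes its MD5 digest, encodes it with
    // the base64 coder and truncates the result to yacySeedDB.commonHashLength characters:
    //   String h = plasmaCondenser.word2hash("YaCy"); // same hash as word2hash("yacy")
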
    public static final Set<String> words2hashSet(String[] words) {
        TreeSet<String> hashes = new TreeSet<String>(kelondroBase64Order.enhancedComparator);
        for (int i = 0; i < words.length; i++) hashes.add(word2hash(words[i]));
        return hashes;
    }

    public static final String words2hashString(String[] words) {
        StringBuffer sb = new StringBuffer();
        for (int i = 0; i < words.length; i++) sb.append(word2hash(words[i]));
        return new String(sb);
    }

    public static final TreeSet<String> words2hashes(Set<String> words) {
        Iterator<String> i = words.iterator();
        TreeSet<String> hashes = new TreeSet<String>(kelondroBase64Order.enhancedComparator);
        while (i.hasNext()) hashes.add(word2hash(i.next()));
        return hashes;
    }

    public int excludeWords(TreeSet<String> stopwords) {
        // subtracts the given stopwords from the word list
        // the word list shrinks; the return value is the number of removed words
        int oldsize = words.size();
        words = kelondroMSetTools.excludeConstructive(words, stopwords);
        return oldsize - words.size();
    }

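    // Example (sketch, with a hypothetical stopword list): remove common words after condensing.
    //   TreeSet<String> stopwords = new TreeSet<String>();
    //   stopwords.add("the"); stopwords.add("and");
    //   int removed = condenser.excludeWords(stopwords);
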
    public Map<String, wordStatProp> words() {
        // returns the words as word/wordStatProp relation map
        return words;
    }

    public Map<StringBuffer, phraseStatProp> sentences() {
        return sentences;
    }

    public static class wordStatProp {
        // object carries statistics for a single word

        public int count;       // number of occurrences
        public int posInText;   // unique handle, is initialized with word position (excluding double occurring words)
        public int posInPhrase; // position of word in phrase
        public int numOfPhrase; // number of phrase. 'normal' phrases begin with number 100
        private HashSet<Integer> hash; // a set of handles to all sentences where this word appears
        public kelondroBitfield flags; // the flag bits for each word

        public wordStatProp(int handle, int pip, int nop) {
            this.count = 1;
            this.posInText = handle;
            this.posInPhrase = pip;
            this.numOfPhrase = nop;
            this.hash = new HashSet<Integer>();
            this.flags = null;
        }

        public void inc() {
            count++;
        }

        public void check(int i) {
            hash.add(new Integer(i));
        }

    }

    public static class phraseStatProp {
        // object carries statistics for a single sentence (phrase)

        public int count;  // number of occurrences
        public int handle; // unique handle, is initialized with sentence counter
        private HashSet<Integer> hash; //

        public phraseStatProp(int handle) {
            this.count = 1;
            this.handle = handle;
            this.hash = new HashSet<Integer>();
        }

        public void inc() {
            count++;
        }

        public void check(int i) {
            hash.add(new Integer(i));
        }

    }

    public String intString(int number, int length) {
        String s = Integer.toString(number);
        while (s.length() < length) s = "0" + s;
        return s;
    }

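    // Example (sketch): intString left-pads the decimal representation with zeros, so
    // intString(7, 5) yields "00007"; this fixed width (numlength) is what allows word
    // handles to be concatenated into a sentence string and split apart again later.
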
    private void createCondensement(InputStream is, String charset) throws UnsupportedEncodingException {
        HashSet<String> currsentwords = new HashSet<String>();
        StringBuffer sentence = new StringBuffer(100);
        String word = "";
        String k;
        int wordlen;
        wordStatProp wsp, wsp1;
        phraseStatProp psp;
        int wordHandle;
        int wordHandleCount = 0;
        int sentenceHandleCount = 0;
        int allwordcounter = 0;
        int allsentencecounter = 0;
        int idx;
        int wordInSentenceCounter = 1;
        boolean comb_indexof = false, last_last = false, last_index = false;
        RandomAccessFile fa;
        final boolean dumpWords = false;

        if (dumpWords) try {
            fa = new RandomAccessFile(new File("dump.txt"), "rw");
            fa.seek(fa.length());
        } catch (IOException e) {
            e.printStackTrace();
            fa = null;
        }

        // read source
        sievedWordsEnum wordenum = new sievedWordsEnum(is, charset, wordminsize);
        while (wordenum.hasMoreElements()) {
            word = (new String((StringBuffer) wordenum.nextElement())).toLowerCase(); // TODO: does toLowerCase work for non ISO-8859-1 chars?
            //System.out.println("PARSED-WORD " + word);

            // This is useful for testing what YaCy "sees" of a website.
            if (dumpWords && fa != null) try {
                fa.writeBytes(word);
                fa.write(160);
            } catch (IOException e) {
                e.printStackTrace();
            }

            // distinguish punctuation and words
            wordlen = word.length();
            Iterator<String> it;
            if ((wordlen == 1) && (htmlFilterContentScraper.punctuation(word.charAt(0)))) {
                // store sentence
                if (sentence.length() > 0) {
                    // we store the punctuation symbol as first element of the sentence vector
                    allsentencecounter++;
                    sentence.insert(0, word); // insert at beginning
                    if (sentences.containsKey(sentence)) {
                        // sentence already exists
                        psp = (phraseStatProp) sentences.get(sentence);
                        psp.inc();
                        idx = psp.handle;
                        sentences.put(sentence, psp);
                    } else {
                        // create new sentence
                        idx = sentenceHandleCount++;
                        sentences.put(sentence, new phraseStatProp(idx));
                    }
                    // store to the words a link to this sentence
                    it = currsentwords.iterator();
                    while (it.hasNext()) {
                        k = (String) it.next();
                        wsp = (wordStatProp) words.get(k);
                        wsp.check(idx);
                        words.put(k, wsp);
                    }
                }
                sentence = new StringBuffer(100);
                currsentwords.clear();
                wordInSentenceCounter = 1;
            } else {
                // check index.of detection
                if ((last_last) && (comb_indexof) && (word.equals("modified"))) {
                    this.RESULT_FLAGS.set(flag_cat_indexof, true);
                    wordenum.pre(true); // parse lines as they come with CRLF
                }
                if ((last_index) && (word.equals("of"))) comb_indexof = true;
                last_last = word.equals("last");
                last_index = word.equals("index");

                // store word
                allwordcounter++;
                currsentwords.add(word);
                if (words.containsKey(word)) {
                    // word already exists
                    wsp = (wordStatProp) words.get(word);
                    wordHandle = wsp.posInText;
                    wsp.inc();
                } else {
                    // word does not yet exist, create new word entry
                    wordHandle = wordHandleCount++;
                    wsp = new wordStatProp(wordHandle, wordInSentenceCounter, sentences.size() + 100);
                    wsp.flags = (kelondroBitfield) RESULT_FLAGS.clone();
                }
                words.put(word, wsp);
                // we now have the unique handle of the word, put it into the sentence:
                sentence.append(intString(wordHandle, numlength));
                wordInSentenceCounter++;
            }
        }
        // finish last sentence
        if (sentence.length() > 0) {
            allsentencecounter++;
            sentence.insert(0, "."); // insert at beginning
            if (sentences.containsKey(sentence)) {
                psp = (phraseStatProp) sentences.get(sentence);
                psp.inc();
                sentences.put(sentence, psp);
            } else {
                sentences.put(sentence, new phraseStatProp(sentenceHandleCount++));
            }
        }

        if (dumpWords && fa != null) try {
            fa.write('\n');
            fa.close();
        } catch (IOException e) {
            e.printStackTrace();
        }

        // -------------------

        // we reconstruct the sentence hashtable
        // and order the entries by the number of the sentence
        // this structure is needed to replace double occurring words in sentences
        Object[] orderedSentences = new Object[sentenceHandleCount];
        String[] s;
        int wc;
        Object o;
        Iterator<StringBuffer> sit = sentences.keySet().iterator();
        while (sit.hasNext()) {
            o = sit.next();
            if (o != null) {
                sentence = (StringBuffer) o;
                wc = (sentence.length() - 1) / numlength;
                s = new String[wc + 2];
                psp = (phraseStatProp) sentences.get(sentence);
                s[0] = intString(psp.count, numlength); // number of occurrences of this sentence
                s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
                for (int i = 0; i < wc; i++) {
                    k = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
                    s[i + 2] = k;
                }
                orderedSentences[psp.handle] = s;
            }
        }

        Map.Entry<String, wordStatProp> entry;
        // we search for similar words and reorganize the corresponding sentences
        // a word is considered similar if a shortened version of it equals another word
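        // Example (sketch): with the default wordcut of 2, the word "fishes" is first checked
        // against "fish" (two characters cut) and then against "fishe" (one character cut);
        // if "fish" is already in the word list, "fishes" is merged into it and all sentence
        // references are repointed to the handle of "fish".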
        Iterator<Map.Entry<String, wordStatProp>> wi = words.entrySet().iterator(); // enumerates the keys in ascending order
        wordsearch: while (wi.hasNext()) {
            entry = wi.next();
            word = entry.getKey();
            wordlen = word.length();
            wsp = entry.getValue();
            for (int i = wordcut; i > 0; i--) {
                if (wordlen > i) {
                    k = word.substring(0, wordlen - i);
                    if (words.containsKey(k)) {
                        // we will delete the word 'word' and repoint the
                        // corresponding links in sentences that use this word
                        wsp1 = (wordStatProp) words.get(k);
                        Iterator<Integer> it1 = wsp.hash.iterator(); // we iterate over all sentences that refer to this word
                        while (it1.hasNext()) {
                            idx = it1.next().intValue(); // number of a sentence
                            s = (String[]) orderedSentences[idx];
                            for (int j = 2; j < s.length; j++) {
                                if (s[j].equals(intString(wsp.posInText, numlength)))
                                    s[j] = intString(wsp1.posInText, numlength);
                            }
                            orderedSentences[idx] = s;
                        }
                        // update word counter
                        wsp1.count = wsp1.count + wsp.count;
                        words.put(k, wsp1);
                        // remove current word
                        wi.remove();
                        continue wordsearch;
                    }
                }
            }
        }

        // depending on the orderedSentences structure, we rebuild the sentence
        // HashMap to eliminate double occurring sentences
        sentences = new HashMap<StringBuffer, phraseStatProp>();
        int le;
        for (int i = 0; i < orderedSentences.length; i++) {
            le = ((String[]) orderedSentences[i]).length;
            sentence = new StringBuffer(le * 10);
            for (int j = 1; j < le; j++)
                sentence.append(((String[]) orderedSentences[i])[j]);
            if (sentences.containsKey(sentence)) {
                // add sentence counter to counter of found sentence
                psp = sentences.get(sentence);
                psp.count = psp.count + Integer.parseInt(((String[]) orderedSentences[i])[0]);
                sentences.put(sentence, psp);
                // System.out.println("Found double occurring sentence " + i + " = " + sp.handle);
            } else {
                // create new sentence entry
                psp = new phraseStatProp(i);
                psp.count = Integer.parseInt(((String[]) orderedSentences[i])[0]);
                sentences.put(sentence, psp);
            }
        }

        // store result
        //this.RESULT_NUMB_TEXT_BYTES = wordenum.count();
        this.RESULT_NUMB_WORDS = allwordcounter;
        this.RESULT_DIFF_WORDS = wordHandleCount;
        this.RESULT_NUMB_SENTENCES = allsentencecounter;
        this.RESULT_DIFF_SENTENCES = sentenceHandleCount;
    }

    public void print() {
        String[] s = sentenceReconstruction();

        // printout a reconstruction of the text
        for (int i = 0; i < s.length; i++) {
            if (s[i] != null) System.out.print("#T " + intString(i, numlength) + " " + s[i]);
        }
    }

    private String[] sentenceReconstruction() {
        // we reconstruct the word hashtable
        // and order the entries by their word handle (posInText)
        // this structure is only needed to reconstruct the text
        String word;
        wordStatProp wsp;
        Map.Entry<String, wordStatProp> entry;
        Iterator<Map.Entry<String, wordStatProp>> it;
        String[] orderedWords = new String[words.size() + 99]; // uuiiii, the '99' is only a quick hack...
        it = words.entrySet().iterator(); // enumerates the keys in ascending order
        while (it.hasNext()) {
            entry = it.next();
            word = entry.getKey();
            wsp = entry.getValue();
            orderedWords[wsp.posInText] = word;
        }

        Object[] orderedSentences = makeOrderedSentences();

        // create a reconstruction of the text
        String[] result = new String[orderedSentences.length];
        String s;
        for (int i = 0; i < orderedSentences.length; i++) {
            if (orderedSentences[i] != null) {
                // TODO: bugfix for UTF-8: avoid this form of string concatenation
                s = "";
                for (int j = 2; j < ((String[]) orderedSentences[i]).length; j++) {
                    s += " " + orderedWords[Integer.parseInt(((String[]) orderedSentences[i])[j])];
                }
                s += ((String[]) orderedSentences[i])[1];
                result[i] = (s.length() > 1) ? s.substring(1) : s;
            } else {
                result[i] = "";
            }
        }
        return result;
    }

    private Object[] makeOrderedSentences() {
        // we reconstruct the sentence hashtable again and create by-handle ordered entries
        // this structure is needed to present the strings in the right order in a printout
        int wc;
        phraseStatProp psp;
        String[] s;
        StringBuffer sentence;
        Object[] orderedSentences = new Object[sentences.size()];
        for (int i = 0; i < sentences.size(); i++) {
            orderedSentences[i] = null; // this array must be initialized
        }
        Iterator<StringBuffer> it = sentences.keySet().iterator();
        while (it.hasNext()) {
            sentence = (StringBuffer) it.next();
            wc = (sentence.length() - 1) / numlength;
            s = new String[wc + 2];
            psp = (phraseStatProp) sentences.get(sentence);
            s[0] = intString(psp.count, numlength); // number of occurrences of this sentence
            s[1] = sentence.substring(0, 1); // the termination symbol of this sentence
            for (int i = 0; i < wc; i++)
                s[i + 2] = sentence.substring(i * numlength + 1, (i + 1) * numlength + 1);
            orderedSentences[psp.handle] = s;
        }
        return orderedSentences;
    }

    public final static boolean invisible(char c) {
        // TODO: Bugfix for UTF-8: does this work for non ISO-8859-1 chars?
        if ((c < ' ') || (c > 'z')) return true;
        return ("$%&/()=\"$%&/()=`^+*~#'-_:;,|<>[]\\".indexOf(c) >= 0);
    }

    public static Enumeration<StringBuffer> wordTokenizer(String s, String charset, int minLength) {
        try {
            return new sievedWordsEnum(new ByteArrayInputStream(s.getBytes()), charset, minLength);
        } catch (Exception e) {
            return null;
        }
    }

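    // Example (sketch): tokenize a short string into sieved words.
    //   Enumeration<StringBuffer> tokens = plasmaCondenser.wordTokenizer("YaCy is a distributed search engine.", "UTF-8", 3);
    //   while (tokens.hasMoreElements()) System.out.println(tokens.nextElement());
    // Note that s.getBytes() above uses the platform default encoding; the charset argument only
    // controls how those bytes are decoded again.
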
    public static class sievedWordsEnum implements Enumeration<StringBuffer> {
        // this enumeration removes all words that contain unwanted characters or are too short

        StringBuffer buffer = null;
        unsievedWordsEnum e;
        int ml;

        public sievedWordsEnum(InputStream is, String charset, int minLength) throws UnsupportedEncodingException {
            e = new unsievedWordsEnum(is, charset);
            ml = minLength;
            buffer = nextElement0();
        }

        public void pre(boolean x) {
            e.pre(x);
        }

        private StringBuffer nextElement0() {
            StringBuffer s;
            char c;
            loop: while (e.hasMoreElements()) {
                s = (StringBuffer) e.nextElement();
                if ((s.length() == 1) && (htmlFilterContentScraper.punctuation(s.charAt(0)))) return s;
                if ((s.length() < ml) && (!(s.toString().equals("of")))) continue loop;
                for (int i = 0; i < s.length(); i++) {
                    c = s.charAt(i);
                    // TODO: Bugfix needed for UTF-8
                    if (((c < 'a') || (c > 'z')) &&
                        ((c < 'A') || (c > 'Z')) &&
                        ((c < '0') || (c > '9')))
                        continue loop; // go to next while loop
                }
                return s;
            }
            return null;
        }

        public boolean hasMoreElements() {
            return buffer != null;
        }

        public StringBuffer nextElement() {
            StringBuffer r = buffer;
            buffer = nextElement0();
            return r;
        }

    }

    private static class unsievedWordsEnum implements Enumeration<StringBuffer> {
        // returns an enumeration of StringBuffer Objects
        StringBuffer buffer = null;
        sentencesFromInputStreamEnum e;
        StringBuffer s;

        public unsievedWordsEnum(InputStream is, String charset) throws UnsupportedEncodingException {
            e = new sentencesFromInputStreamEnum(is, charset);
            s = new StringBuffer();
            buffer = nextElement0();
        }

        public void pre(boolean x) {
            e.pre(x);
        }

        private StringBuffer nextElement0() {
            StringBuffer r;
            StringBuffer sb;
            char c;
            while (s.length() == 0) {
                if (e.hasNext()) {
                    r = (StringBuffer) e.next();
                    if (r == null) return null;
                    r = trim(r);
                    sb = new StringBuffer(r.length() * 2);
                    for (int i = 0; i < r.length(); i++) {
                        c = r.charAt(i);
                        if (invisible(c)) sb = sb.append(' '); // TODO: Bugfix needed for UTF-8
                        else if (htmlFilterContentScraper.punctuation(c)) sb = sb.append(' ').append(c).append(' ');
                        else sb = sb.append(c);
                    }
                    s = trim(sb);
                    //System.out.println("PARSING-LINE '" + r + "'->'" + s + "'");
                } else {
                    return null;
                }
            }
            int p = s.indexOf(" ");
            if (p < 0) {
                r = s;
                s = new StringBuffer();
                return r;
            }
            r = trim(new StringBuffer(s.substring(0, p)));
            s = trim(s.delete(0, p + 1));
            return r;
        }

        public boolean hasMoreElements() {
            return buffer != null;
        }

        public StringBuffer nextElement() {
            StringBuffer r = buffer;
            buffer = nextElement0();
            return r;
        }

    }

    public static StringBuffer trim(StringBuffer sb) {
        synchronized (sb) {
            while ((sb.length() > 0) && (sb.charAt(0) <= ' ')) sb = sb.deleteCharAt(0);
            while ((sb.length() > 0) && (sb.charAt(sb.length() - 1) <= ' ')) sb = sb.deleteCharAt(sb.length() - 1);
        }
        return sb;
    }

    public static sentencesFromInputStreamEnum sentencesFromInputStream(InputStream is, String charset) {
        try {
            return new sentencesFromInputStreamEnum(is, charset);
        } catch (UnsupportedEncodingException e) {
            return null;
        }
    }

    public static class sentencesFromInputStreamEnum implements Iterator<StringBuffer> {
        // read sentences from a given input stream
        // this enumerates StringBuffer objects

        StringBuffer buffer = null;
        BufferedReader raf;
        int counter = 0;
        boolean pre = false;

        public sentencesFromInputStreamEnum(InputStream is, String charset) throws UnsupportedEncodingException {
            raf = new BufferedReader((charset == null) ? new InputStreamReader(is) : new InputStreamReader(is, charset));
            buffer = nextElement0();
            counter = 0;
            pre = false;
        }

        public void pre(boolean x) {
            this.pre = x;
        }

        private StringBuffer nextElement0() {
            try {
                StringBuffer s = readSentence(raf, pre);
                //System.out.println(" SENTENCE='" + s + "'"); // DEBUG
                if (s == null) {
                    raf.close();
                    return null;
                }
                return s;
            } catch (IOException e) {
                try {
                    raf.close();
                } catch (Exception ee) {
                }
                return null;
            }
        }

        public boolean hasNext() {
            return buffer != null;
        }

        public StringBuffer next() {
            if (buffer == null) {
                return null;
            } else {
                counter = counter + buffer.length() + 1;
                StringBuffer r = buffer;
                buffer = nextElement0();
                return r;
            }
        }

        public int count() {
            return counter;
        }

        public void remove() {
            throw new UnsupportedOperationException();
        }
    }

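    // Example (sketch): iterate over the sentences of a UTF-8 encoded byte stream.
    //   InputStream is = new ByteArrayInputStream("First sentence. Second sentence.".getBytes("UTF-8"));
    //   Iterator<StringBuffer> si = plasmaCondenser.sentencesFromInputStream(is, "UTF-8");
    //   while (si.hasNext()) System.out.println(si.next());
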
    static StringBuffer readSentence(Reader reader, boolean pre) throws IOException {
        StringBuffer s = new StringBuffer();
        int nextChar;
        char c;

        // find sentence end
        for (;;) {
            nextChar = reader.read();
            //System.out.print((char) nextChar); // DEBUG
            if (nextChar < 0) {
                if (s.length() == 0) return null; else break;
            }
            c = (char) nextChar;
            s.append(c);
            if (pre) {
                if ((c == (char) 10) || (c == (char) 13)) break;
            } else {
                if (htmlFilterContentScraper.punctuation(c)) break;
            }
        }

        // replace line endings and backspace characters by blanks
        for (int i = 0; i < s.length(); i++) {
            if ((s.charAt(i) == (char) 10) || (s.charAt(i) == (char) 13) || (s.charAt(i) == (char) 8)) s.setCharAt(i, ' ');
        }
        // remove all double-spaces
        int p; while ((p = s.indexOf("  ")) >= 0) s.deleteCharAt(p);
        return s;
    }

    public static Map<String, wordStatProp> getWords(byte[] text, String charset) throws UnsupportedEncodingException {
        // returns a word/wordStatProp relation map
        if (text == null) return null;
        ByteArrayInputStream buffer = new ByteArrayInputStream(text);
        return new plasmaCondenser(buffer, charset, 2, 1).words();
    }

    public static Map<String, wordStatProp> getWords(String text) {
        // returns a word/wordStatProp relation map
        if (text == null) return null;
        ByteArrayInputStream buffer = new ByteArrayInputStream(text.getBytes());
        try {
            return new plasmaCondenser(buffer, "UTF-8", 2, 1).words();
        } catch (UnsupportedEncodingException e) {
            return null;
        }
    }

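    // Example (sketch): get the word statistics of a plain string.
    //   Map<String, wordStatProp> wordMap = plasmaCondenser.getWords("the quick brown fox");
    //   wordStatProp stat = (wordMap == null) ? null : wordMap.get("quick"); // stat.count == 1 if present
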
    public static void main(String[] args) {
        // read a property file and convert it into configuration lines
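        // Example property file (sketch; the words are only placeholders):
        //   keywords0=word1,word2,word3
        //   keywords1=word4,word5
        //   ...
        //   keywords15=word6
        // Each keywordsN entry is converted into the concatenation of the hashes of its words.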
        try {
            File f = new File(args[0]);
            Properties p = new Properties();
            p.load(new FileInputStream(f));
            StringBuffer sb = new StringBuffer();
            sb.append("{\n");
            for (int i = 0; i <= 15; i++) {
                sb.append('"');
                String s = p.getProperty("keywords" + i);
                String[] l = s.split(",");
                for (int j = 0; j < l.length; j++) {
                    sb.append(word2hash(l[j]));
                }
                if (i < 15) sb.append(",\n");
            }
            sb.append("}\n");
            System.out.println(new String(sb));
        } catch (FileNotFoundException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        }

    }

}