*) storing the document charset in the plasmaParserDocument object (it is needed later by the condenser)

*) htmlFilterContentScraper.java: using proper charset for document title
*) serverByteBuffer.java: adding a new toString(String) overload that allows specifying the charset used to decode the buffered bytes


git-svn-id: https://svn.berlios.de/svnroot/repos/yacy/trunk@2593 6c8d7289-2bf4-0310-a012-ef5d649a1542
pull/1/head
theli 19 years ago
parent c5d3020941
commit 74c3e7cf29

@ -46,6 +46,7 @@ package de.anomic.htmlFilter;
import de.anomic.server.serverByteBuffer;
import de.anomic.net.URL;
import java.io.UnsupportedEncodingException;
import java.net.MalformedURLException;
import java.nio.charset.Charset;
import java.nio.charset.UnsupportedCharsetException;
@ -123,6 +124,10 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
this.charset = charset;
}
/**
 * Accessor for the charset name held by this scraper.
 *
 * @return the current value of the <code>charset</code> field
 */
public String getCharset() {
    return charset;
}
public void scrapeText(byte[] newtext) {
// System.out.println("SCRAPE: " + new String(newtext));
if ((content.length() != 0) && (content.byteAt(content.length() - 1) != 32)) content.append(32);
@ -243,8 +248,14 @@ public class htmlFilterContentScraper extends htmlFilterAbstractScraper implemen
if (s.length() > 0) return s;
// extract headline from content
if (content.length() > 80) return cleanLine(new String(content.getBytes(), 0, 80));
return cleanLine(content.trim().toString());
if (content.length() > 80) {
try {
return cleanLine(new String(content.getBytes(), 0, 80,this.charset));
} catch (UnsupportedEncodingException e) {
return cleanLine(new String(content.getBytes(), 0, 80));
}
}
return cleanLine(content.trim().toString(this.charset));
}
public String[] getHeadlines(int i) {

@ -89,6 +89,7 @@ implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
null,
((contents.length() > 80)? contents.substring(0, 80):contents.trim()).
replaceAll("\r\n"," ").

@ -152,6 +152,7 @@ public class odtParser extends AbstractParser implements Parser {
return new plasmaParserDocument(
location,
mimeType,
"UTF-8",
docKeywords,
docShortTitle,
docLongTitle,

@ -152,6 +152,7 @@ public class pdfParser extends AbstractParser implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
docKeyWords,
docSubject,
docTitle,

@ -147,6 +147,7 @@ public class rpmParser extends AbstractParser implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
null,
name,
summary,

@ -190,6 +190,7 @@ public class rssParser extends AbstractParser implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
null,
null,
feedTitle,

@ -95,6 +95,7 @@ implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location,
mimeType,
"UTF-8",
null,
((bodyText.length() > 80)? bodyText.substring(0, 80):bodyText.trim()).
replaceAll("\r\n"," ").

@ -189,6 +189,7 @@ public class tarParser extends AbstractParser implements Parser {
return new plasmaParserDocument(
location,
mimeType,
null,
docKeywords.toString(),
docShortTitle.toString(),
docLongTitle.toString(),

@ -243,6 +243,7 @@ public class vcfParser extends AbstractParser implements Parser {
plasmaParserDocument theDoc = new plasmaParserDocument(
location, // url of the source document
mimeType, // the documents mime type
null,
null, // a list of extracted keywords
null, // a short document title
parsedTitle.toString(), // a long document title

@ -165,6 +165,7 @@ public class zipParser extends AbstractParser implements Parser {
return new plasmaParserDocument(
location,
mimeType,
null,
docKeywords.toString(),
docShortTitle.toString(),
docLongTitle.toString(),

@ -588,7 +588,7 @@ public final class plasmaParser {
int p = 0;
for (int i = 1; i <= 4; i++) for (int j = 0; j < scraper.getHeadlines(i).length; j++) sections[p++] = scraper.getHeadlines(i)[j];
plasmaParserDocument ppd = new plasmaParserDocument(new URL(location.toNormalform()),
mimeType, null, null, scraper.getTitle(),
mimeType, scraper.getCharset(), null, null, scraper.getTitle(),
sections, null,
scraper.getText(), scraper.getAnchors(), scraper.getImages());
//scraper.close();
@ -749,7 +749,12 @@ public final class plasmaParser {
if (document != null) {
// found text
String[] sentences = document.getSentences();
if (sentences != null) for (int i = 0; i < sentences.length; i++) System.out.println("line " + i + ":" + sentences[i]);
if (sentences != null) {
for (int i = 0; i < sentences.length; i++) {
System.out.print("line " + i + ": ");
System.out.println(sentences[i]);
}
}
// found links
int anchorNr = 0;

@ -56,6 +56,7 @@ public class plasmaParserDocument {
URL location; // the source url
String mimeType; // mimeType as taken from http header
String charset; // the charset of the document
String keywords; // most resources provide a keyword field
String shortTitle; // a shortTitle mostly appears in the window header (border)
String longTitle; // the real title of the document, commonly h1-tags
@ -73,12 +74,13 @@ public class plasmaParserDocument {
plasmaCondenser condenser;
boolean resorted;
public plasmaParserDocument(URL location, String mimeType,
public plasmaParserDocument(URL location, String mimeType, String charset,
String keywords, String shortTitle, String longTitle,
String[] sections, String abstrct,
byte[] text, Map anchors, TreeSet images) {
this.location = location;
this.mimeType = (mimeType==null)?"application/octet-stream":mimeType;
this.charset = charset;
this.keywords = (keywords==null)?"":keywords;
this.shortTitle = (shortTitle==null)?"":shortTitle;
this.longTitle = (longTitle==null)?"":longTitle;
@ -98,6 +100,13 @@ public class plasmaParserDocument {
return this.mimeType;
}
/**
 * Accessor for the charset name stored with this parsed document.
 *
 * @return the charset name that was handed to the constructor, or
 *         <code>null</code> if the charset is unknown
 */
public String getCharset() {
    return charset;
}
/**
 * Returns the title best suited for a short display.
 * <p>
 * The short title is preferred. The constructor stores an empty string in
 * place of a <code>null</code> short title, so an empty short title is
 * treated like a missing one and the long title is returned instead.
 *
 * @return the short title if it is non-empty, otherwise the long title
 */
public String getMainShortTitle() {
    // a pure null check would never fall through, because null is normalized to ""
    if (shortTitle != null && shortTitle.length() > 0) {
        return shortTitle;
    }
    return longTitle;
}

@ -346,6 +346,14 @@ public final class serverByteBuffer extends OutputStream {
/**
 * Builds a string from <code>length</code> elements of <code>buffer</code>,
 * starting at <code>offset</code>.
 * <p>
 * NOTE(review): assuming <code>buffer</code> is a byte array, the bytes are
 * decoded with the platform default charset - confirm against the field
 * declaration.
 *
 * @return the string representation of the buffered content
 */
public String toString() {
    final String decoded = new String(buffer, offset, length);
    return decoded;
}
/**
 * Decodes the bytes returned by <code>getBytes()</code> into a string using
 * the given charset.
 * <p>
 * Falls back to the platform default charset when <code>charsetName</code>
 * is <code>null</code> or names a charset that is not supported, so callers
 * that pass along an unknown document charset do not fail.
 *
 * @param charsetName name of the charset to decode with; may be <code>null</code>
 * @return the decoded content
 */
public String toString(String charsetName) {
    final byte[] bytes = this.getBytes();
    // new String(byte[], String) throws a NullPointerException for a null
    // charset name, which the catch below would not cover
    if (charsetName == null) {
        return new String(bytes);
    }
    try {
        return new String(bytes, charsetName);
    } catch (UnsupportedEncodingException e) {
        // unknown charset name: best-effort decoding with the default charset
        return new String(bytes);
    }
}
public String toString(int left, int rightbound) {
return new String(buffer, offset + left, rightbound - left);

Loading…
Cancel
Save