Fix HTML tag attribute parsing for tags containing an attribute w/o value

e.g. itemscope or autofocus (in such cases the key following the valueless
attribute was not recognized properly).
pull/98/head
reger 8 years ago
parent cb95b7339a
commit 083df255e4

@ -396,12 +396,12 @@ public class ContentScraper extends AbstractScraper implements Scraper {
final String classprop = tag.opts.getProperty("class", EMPTY_STRING);
this.vocabularyScraper.check(this.root, classprop, tag.content);
// itemprop
// itemprop (schema.org)
String itemprop = tag.opts.getProperty("itemprop");
if (itemprop != null) {
String propval = tag.opts.getProperty("content");
if (propval == null) propval = tag.opts.getProperty("datetime"); // html5 example: <time itemprop="startDate" datetime="2016-01-26">today</time> while each prop is optional
if (propval != null) {
if (propval == null) propval = tag.opts.getProperty("datetime"); // html5 + schema.org#itemprop example: <time itemprop="startDate" datetime="2016-01-26">today</time> while each prop is optional
if (propval != null) { // html5 example: <time datetime="2016-01-26">today</time> while each prop is optional
if ("startDate".equals(itemprop)) try {
// parse ISO 8601 date
Date startDate = ISO8601Formatter.FORMATTER.parse(propval, this.timezoneOffset).getTime();
@ -646,7 +646,7 @@ public class ContentScraper extends AbstractScraper implements Scraper {
if ((href.length() > 0) && ((url = absolutePath(href)) != null)) {
if (followDenied()) {
String rel = tag.opts.getProperty("rel", EMPTY_STRING);
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
if (rel.length() == 0) rel = "nofollow"; else if (rel.indexOf("nofollow") < 0) rel += ",nofollow";
tag.opts.put("rel", rel);
}
tag.opts.put("text", stripAllTags(tag.content.getChars())); // strip any inline html in tag text like "<a ...> <span>test</span> </a>"
@ -726,8 +726,8 @@ public class ContentScraper extends AbstractScraper implements Scraper {
h = cleanLine(CharacterCoding.html2unicode(stripAllTags(tag.content.getChars())));
if (h.length() > 0) this.articles.add(h);
} else if (tag.name.equalsIgnoreCase(TagName.time.name())) { // html5 tag <time datetime="2016-12-23">Event</time>
h = tag.opts.getProperty("datetime");
if (h != null) {
h = tag.opts.getProperty("datetime"); // TODO: checkOpts() also parses datetime property if in combination with schema.org itemprop=startDate/endDate
if (h != null) { // datetime property is optional
try {
Date startDate = ISO8601Formatter.FORMATTER.parse(h, this.timezoneOffset).getTime();
this.startDates.add(startDate);

@ -393,6 +393,12 @@ public final class CharBuffer extends Writer {
return new String(this.buffer, this.offset + left, rightbound - left);
}
/**
* Parses tag properties for key=value pairs.
* Single attributes w/o value (e.g. itemscope) are added as key with value empty String.
*
* @return the parsed attributes as Properties, keyed by lower-cased attribute name
*/
public Properties propParser() {
// extract a=b or a="b" - relations from the buffer
int pos = this.offset;
@ -404,10 +410,13 @@ public final class CharBuffer extends Writer {
while (pos < this.length) {
// pos is at start of next key
start = pos;
while ((pos < this.length) && (this.buffer[pos] != equal)) pos++;
if (pos >= this.length) break; // this is the case if we found no equal
key = new String(this.buffer, start, pos - start).trim().toLowerCase();
// we have a key
while ((pos < this.length) && (this.buffer[pos] != equal && this.buffer[pos] > 32) ) pos++; // find = or whitespace
key = new String(this.buffer, start, pos - start).trim().toLowerCase(); // we have a key
while ((pos < this.length) && (this.buffer[pos] != equal && this.buffer[pos] <= 32)) pos++; // eat up whitespace until = or next char found
if (pos >= this.length || this.buffer[pos] != equal) { // no = found, this is the case for attributes w/o value
p.setProperty(key, "");
continue;
}
pos++;
// find start of value
while ((pos < this.length) && (this.buffer[pos] <= 32)) pos++;
@ -488,4 +497,4 @@ public final class CharBuffer extends Writer {
trimToSize();
}
}
}

@ -0,0 +1,41 @@
/**
* CharBufferTest
* part of YaCy
* Copyright 2016 by reger24; https://github.com/reger24
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with this program in the file lgpl21.txt
* If not, see <http://www.gnu.org/licenses/>.
*/
package net.yacy.kelondro.io;
import java.util.Properties;
import org.junit.Assert;
import org.junit.Test;
/**
 * Unit tests for {@link CharBuffer}.
 */
public class CharBufferTest {

    /**
     * Test of propParser method, of class CharBuffer.
     *
     * Feeds an attribute string in which a valueless attribute (itemscope)
     * sits between two regular key="value" attributes and verifies that
     * the surrounding keys are still recognized and that the valueless
     * attribute itself is stored with an empty String as value.
     */
    @Test
    public void testPropParser() {
        CharBuffer cb = new CharBuffer(100);

        // test attribute w/o value
        cb.append("class=\"company-name\" itemscope itemtype=\"https://schema.org/Organization\"");
        Properties p = cb.propParser();

        // the keys before and after the valueless attribute must both be found
        Assert.assertNotNull(p.get("class"));
        Assert.assertNotNull(p.get("itemtype"));

        // the valueless attribute itself must be present, mapped to an empty String
        Assert.assertEquals("", p.getProperty("itemscope"));
    }
}
Loading…
Cancel
Save