Support parsing audio URLs without file extension

Also added a JUnit test for the audio tag parser
pull/292/head
luccioman 6 years ago
parent 42c8a251c8
commit e90405b6f0

@ -63,9 +63,9 @@ import net.yacy.kelondro.util.FileUtils;
public class audioTagParser extends AbstractParser implements Parser { public class audioTagParser extends AbstractParser implements Parser {
/** /**
* Enumeration of internet media types supported by the {@link audioTagParser}. * Enumeration of audio formats supported by the {@link audioTagParser}.
*/ */
public enum SupportedAudioMediaType { public enum SupportedAudioFormat {
AIF("audio/aiff", new String[] { "audio/x-aiff" }, new String[] { SupportedFileFormat.AIF.getFilesuffix(), AIF("audio/aiff", new String[] { "audio/x-aiff" }, new String[] { SupportedFileFormat.AIF.getFilesuffix(),
SupportedFileFormat.AIFC.getFilesuffix(), SupportedFileFormat.AIFF.getFilesuffix() }), SupportedFileFormat.AIFC.getFilesuffix(), SupportedFileFormat.AIFF.getFilesuffix() }),
@ -118,7 +118,7 @@ public class audioTagParser extends AbstractParser implements Parser {
* @param mediaType the media type, formatted as "type/subtype" * @param mediaType the media type, formatted as "type/subtype"
* @param fileExtensions a set of file extensions matching the given media type * @param fileExtensions a set of file extensions matching the given media type
*/ */
private SupportedAudioMediaType(final String mediaType, final String[] fileExtensions) { private SupportedAudioFormat(final String mediaType, final String[] fileExtensions) {
this(mediaType, new String[] {}, fileExtensions); this(mediaType, new String[] {}, fileExtensions);
} }
@ -127,7 +127,7 @@ public class audioTagParser extends AbstractParser implements Parser {
* @param alternateMediaTypes alternate flavors of the main media type, all formatted as "type/subtype" * @param alternateMediaTypes alternate flavors of the main media type, all formatted as "type/subtype"
* @param fileExtensions a set of file extensions matching the given media type * @param fileExtensions a set of file extensions matching the given media type
*/ */
private SupportedAudioMediaType(final String mediaType, final String[] alternateMediaTypes, final String[] fileExtensions) { private SupportedAudioFormat(final String mediaType, final String[] alternateMediaTypes, final String[] fileExtensions) {
this.mediaType = mediaType.toLowerCase(Locale.ROOT); this.mediaType = mediaType.toLowerCase(Locale.ROOT);
Set<String> alternates = new HashSet<>(); Set<String> alternates = new HashSet<>();
for (final String alternateMediaType : alternateMediaTypes) { for (final String alternateMediaType : alternateMediaTypes) {
@ -179,7 +179,7 @@ public class audioTagParser extends AbstractParser implements Parser {
*/ */
public static Set<String> getAllMediaTypes() { public static Set<String> getAllMediaTypes() {
final Set<String> mediaTypes = new HashSet<>(); final Set<String> mediaTypes = new HashSet<>();
for(final SupportedAudioMediaType mediaType : SupportedAudioMediaType.values()) { for(final SupportedAudioFormat mediaType : SupportedAudioFormat.values()) {
mediaTypes.add(mediaType.getMediaType()); mediaTypes.add(mediaType.getMediaType());
for(final String mediaTypeString : mediaType.getAlternateMediaTypes()) { for(final String mediaTypeString : mediaType.getAlternateMediaTypes()) {
mediaTypes.add(mediaTypeString); mediaTypes.add(mediaTypeString);
@ -193,15 +193,18 @@ public class audioTagParser extends AbstractParser implements Parser {
*/ */
public static Set<String> getAllFileExtensions() { public static Set<String> getAllFileExtensions() {
final Set<String> extensions = new HashSet<>(); final Set<String> extensions = new HashSet<>();
for(final SupportedAudioMediaType mediaType : SupportedAudioMediaType.values()) { for(final SupportedAudioFormat mediaType : SupportedAudioFormat.values()) {
extensions.addAll(mediaType.getFileExtensions()); extensions.addAll(mediaType.getFileExtensions());
} }
return extensions; return extensions;
} }
} }
/** Map from each supported audio file extension to a single audio media type */ /** Map from each supported audio file extension to audio format */
private final Map<String, SupportedAudioMediaType> ext2NormalMediaType; private final Map<String, SupportedAudioFormat> ext2Format;
/** Map from each supported audio media type to audio format */
private final Map<String, SupportedAudioFormat> mediaType2Format;
/** Space character */ /** Space character */
private static final char SPACE_CHAR = ' '; private static final char SPACE_CHAR = ' ';
@ -217,18 +220,25 @@ public class audioTagParser extends AbstractParser implements Parser {
public audioTagParser() { public audioTagParser() {
super("Audio File Meta-Tag Parser"); super("Audio File Meta-Tag Parser");
final Map<String, SupportedAudioMediaType> normalMap = new HashMap<>(); final Map<String, SupportedAudioFormat> ext2Formats = new HashMap<>();
final Map<String, SupportedAudioFormat> mediaType2Formats = new HashMap<>();
for(final SupportedAudioMediaType mediaType : SupportedAudioMediaType.values()) { for(final SupportedAudioFormat mediaType : SupportedAudioFormat.values()) {
this.SUPPORTED_MIME_TYPES.add(mediaType.getMediaType()); this.SUPPORTED_MIME_TYPES.add(mediaType.getMediaType());
this.SUPPORTED_MIME_TYPES.addAll(mediaType.getAlternateMediaTypes()); this.SUPPORTED_MIME_TYPES.addAll(mediaType.getAlternateMediaTypes());
this.SUPPORTED_EXTENSIONS.addAll(mediaType.getFileExtensions()); this.SUPPORTED_EXTENSIONS.addAll(mediaType.getFileExtensions());
for(final String fileExtension : mediaType.getFileExtensions()) { for(final String fileExtension : mediaType.getFileExtensions()) {
normalMap.put(fileExtension, mediaType); ext2Formats.put(fileExtension, mediaType);
}
mediaType2Formats.put(mediaType.getMediaType(), mediaType);
for(final String mediaTypeStr : mediaType.getAlternateMediaTypes()) {
mediaType2Formats.put(mediaTypeStr, mediaType);
} }
} }
this.ext2NormalMediaType = Collections.unmodifiableMap(normalMap); this.ext2Format = Collections.unmodifiableMap(ext2Formats);
this.mediaType2Format = Collections.unmodifiableMap(mediaType2Formats);
} }
@Override @Override
@ -246,9 +256,34 @@ public class audioTagParser extends AbstractParser implements Parser {
@Override @Override
public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper, public Document[] parseWithLimits(final DigestURL location, final String mimeType, final String charset, final VocabularyScraper scraper,
final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes) final int timezoneOffset, final InputStream source, final int maxLinks, final long maxBytes)
throws UnsupportedOperationException, Failure, InterruptedException { throws Failure, InterruptedException {
String filename = location.getFileName(); String filename = location.getFileName();
final String fileext = MultiProtocolURL.getFileExtension(filename); String fileExt = MultiProtocolURL.getFileExtension(filename);
SupportedAudioFormat audioFormat = null;
if(fileExt != null) {
audioFormat = this.ext2Format.get(fileExt);
}
if(audioFormat == null) {
audioFormat = this.mediaType2Format.get(mimeType);
}
String normalizedMediaType = mimeType;
if(audioFormat != null) {
/* normalize to a single Media Type. Advantages :
* - index document with the right media type when HTTP response header "Content-Type" is missing or has a wrong value
* - for easier search by CollectionSchema.content_type in the index
*/
normalizedMediaType = audioFormat.getMediaType();
if(fileExt.isEmpty() || !ext2Format.containsKey(fileExt)) {
/* Normalize the extension to one known by jaudiotagger */
fileExt = audioFormat.getFileExtensions().iterator().next();
}
}
filename = filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename); filename = filename.isEmpty() ? location.toTokens() : MultiProtocolURL.unescape(filename);
File tempFile = null; File tempFile = null;
@ -260,7 +295,7 @@ public class audioTagParser extends AbstractParser implements Parser {
f = AudioFileIO.read(location.getFSFile()); f = AudioFileIO.read(location.getFSFile());
} else { } else {
// create a temporary file, as jaudiotagger requires a file rather than an input stream // create a temporary file, as jaudiotagger requires a file rather than an input stream
tempFile = File.createTempFile(filename, "." + fileext); tempFile = File.createTempFile(filename, "." + fileExt);
long bytesCopied = FileUtils.copy(source, tempFile, maxBytes); long bytesCopied = FileUtils.copy(source, tempFile, maxBytes);
partiallyParsed = bytesCopied == maxBytes && source.read() != -1; partiallyParsed = bytesCopied == maxBytes && source.read() != -1;
f = AudioFileIO.read(tempFile); f = AudioFileIO.read(tempFile);
@ -316,21 +351,9 @@ public class audioTagParser extends AbstractParser implements Parser {
detectedUrls = Collections.emptySet(); detectedUrls = Collections.emptySet();
} }
/* normalize to a single Media Type. Advantages :
* - index document with the right media type when HTTP response header "Content-Type" is missing or has a wrong value
* - for easier search by CollectionSchema.content_type in the index
*/
String mime = mimeType;
if(fileext != null && !fileext.isEmpty() ) {
final SupportedAudioMediaType mediaType = this.ext2NormalMediaType.get(fileext);
if(mediaType != null) {
mime = mediaType.getMediaType();
}
}
final Document doc = new Document( final Document doc = new Document(
location, location,
mime, normalizedMediaType,
charset, charset,
this, this,
lang, // languages lang, // languages

@ -441,11 +441,11 @@ public class migration {
* All old audio file extensions and media types are denied : we add newly * All old audio file extensions and media types are denied : we add newly
* supported ones to these deny lists * supported ones to these deny lists
*/ */
deniedExtensions.addAll(audioTagParser.SupportedAudioMediaType.getAllFileExtensions()); deniedExtensions.addAll(audioTagParser.SupportedAudioFormat.getAllFileExtensions());
sb.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, deniedExtensions); sb.setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, deniedExtensions);
deniedMediaTypes.addAll(audioTagParser.SupportedAudioMediaType.getAllMediaTypes()); deniedMediaTypes.addAll(audioTagParser.SupportedAudioFormat.getAllMediaTypes());
sb.setConfig(SwitchboardConstants.PARSER_MIME_DENY, deniedMediaTypes); sb.setConfig(SwitchboardConstants.PARSER_MIME_DENY, deniedMediaTypes);

@ -887,8 +887,8 @@ public final class Switchboard extends serverSwitch {
/* audioTagParser is disabled by default as it needs a temporary file (because of the JAudiotagger implementation) for each parsed document */ /* audioTagParser is disabled by default as it needs a temporary file (because of the JAudiotagger implementation) for each parsed document */
if (!enableAudioTags) { if (!enableAudioTags) {
denyExt.addAll(audioTagParser.SupportedAudioMediaType.getAllFileExtensions()); denyExt.addAll(audioTagParser.SupportedAudioFormat.getAllFileExtensions());
denyMime.addAll(audioTagParser.SupportedAudioMediaType.getAllMediaTypes()); denyMime.addAll(audioTagParser.SupportedAudioFormat.getAllMediaTypes());
setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, denyExt); setConfig(SwitchboardConstants.PARSER_EXTENSIONS_DENY, denyExt);
setConfig(SwitchboardConstants.PARSER_MIME_DENY, denyMime); setConfig(SwitchboardConstants.PARSER_MIME_DENY, denyMime);

@ -0,0 +1,142 @@
// audioTagParserTest.java
// ---------------------------
// Copyright 2019 by luccioman; https://github.com/luccioman
//
// This is a part of YaCy, a peer-to-peer based web search engine
//
// LICENSE
//
// This program is free software; you can redistribute it and/or modify
// it under the terms of the GNU General Public License as published by
// the Free Software Foundation; either version 2 of the License, or
// (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU General Public License for more details.
//
// You should have received a copy of the GNU General Public License
// along with this program; if not, write to the Free Software
// Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
package net.yacy.document.parser;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNotNull;
import static org.junit.Assert.assertTrue;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.Collection;
import org.junit.Before;
import org.junit.Test;
import net.yacy.cora.document.id.AnchorURL;
import net.yacy.cora.document.id.DigestURL;
import net.yacy.document.Document;
import net.yacy.document.Parser.Failure;
import net.yacy.document.VocabularyScraper;
/**
 * Unit tests for the {@link audioTagParser} class.
 * <p>
 * Every test feeds a small sample file from the test folder to the parser
 * while varying the document URL and the declared media type.
 */
public class audioTagParserTest {

    /** Folder containing test files */
    private static final File TEST_FOLDER = new File("test", "parsertest");

    /** The parser under test */
    private audioTagParser parser;

    /** Creates a fresh parser instance before each test. */
    @Before
    public void before() {
        this.parser = new audioTagParser();
    }

    /**
     * Opens a sample file from the test folder and runs the parser under test
     * on it. The stream is always closed, whether parsing succeeds or fails.
     *
     * @param testFileName name of the sample file inside the test folder
     * @param location     the document URL handed to the parser
     * @param mediaType    the media type handed to the parser; may be null
     * @return the documents returned by the parser
     * @throws Failure              when the file could not be parsed
     * @throws InterruptedException when parsing was interrupted
     * @throws IOException          when a read/write error occurred
     */
    private Document[] parseTestFile(final String testFileName, final DigestURL location, final String mediaType)
            throws Failure, InterruptedException, IOException {
        try (FileInputStream inStream = new FileInputStream(new File(TEST_FOLDER, testFileName))) {
            return this.parser.parse(location, mediaType, StandardCharsets.UTF_8.name(), new VocabularyScraper(), 0,
                    inStream);
        }
    }

    /**
     * Unit test for the
     * {@link audioTagParser#parse(DigestURL, String, String, VocabularyScraper, int, java.io.InputStream)}
     * function with some small (1 second length) test files.
     *
     * @throws Failure              when a file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParse() throws Failure, InterruptedException, IOException {
        final String[] fileNames = { "umlaute_windows.aiff", "umlaute_windows.flac", "umlaute_windows.m4a",
                "umlaute_windows.mp3", "umlaute_windows.ogg", "umlaute_windows.wav" };
        for (final String fileName : fileNames) {
            final DigestURL location = new DigestURL("http://localhost/" + fileName);
            /* The same media type is deliberately declared for every file, whatever its real format */
            final Document[] documents = parseTestFile(fileName, location, "audio/ogg");

            assertNotNull("Parser result must not be null for file " + fileName, documents);
            /* Fail with a readable message rather than an ArrayIndexOutOfBoundsException */
            assertTrue("Parser must return at least one document for file " + fileName, documents.length > 0);

            final String text = documents[0].getTextString();
            assertNotNull("Parsed text must not be null for file " + fileName, text);
            assertTrue("Parsed text must contain test word with umlaut char for file " + fileName,
                    text.contains("Maßkrügen"));

            final Collection<AnchorURL> anchors = documents[0].getAnchors();
            assertNotNull("Detected URLS must not be null for file " + fileName, anchors);
            assertEquals("One URL must have been detected for file " + fileName, 1, anchors.size());
            /* assertEquals reports the actual value on failure, unlike assertTrue(a.equals(b)) */
            assertEquals("Unexpected detected URL for file " + fileName, "https://yacy.net/",
                    anchors.iterator().next().toString());
        }
    }

    /**
     * Test support for parsing audio document with proper Media Type but without
     * extension or unrelated extension in its file name.
     *
     * @throws Failure              when the file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParseDocUrlWithoutFileExt() throws Failure, InterruptedException, IOException {
        final String testFileName = "umlaute_windows.ogg";
        final String[] locations = { "http://localhost/audioTrack", "http://localhost/example.audio" };
        for (final String locationStr : locations) {
            final DigestURL location = new DigestURL(locationStr);
            final Document[] documents = parseTestFile(testFileName, location, "audio/ogg");
            assertNotNull("Parser result must not be null for URL " + location, documents);
        }
    }

    /**
     * Test support for parsing audio document with unknown or generic Media Type
     *
     * @throws Failure              when the file could not be parsed
     * @throws InterruptedException when the test was interrupted before its
     *                              termination
     * @throws IOException          when a read/write error occurred
     */
    @Test
    public void testParseUnkownMediaType() throws Failure, InterruptedException, IOException {
        final String testFileName = "umlaute_windows.ogg";
        final DigestURL location = new DigestURL("http://localhost/" + testFileName);
        final String[] mediaTypes = { null, "application/octet-stream" };
        for (final String mediaType : mediaTypes) {
            final Document[] documents = parseTestFile(testFileName, location, mediaType);
            assertNotNull("Parser result must not be null for Media Type " + mediaType, documents);
        }
    }
}
Loading…
Cancel
Save