diff options
| author | Niels van Velzen <nielsvanvelzen@users.noreply.github.com> | 2026-01-27 11:31:27 +0100 |
|---|---|---|
| committer | GitHub <noreply@github.com> | 2026-01-27 11:31:27 +0100 |
| commit | 6ed633ce4231df06c71a4a8adefb3be5a1d222f7 (patch) | |
| tree | c1a3fc6a2258d8aa0835a42ade3142baa8d2caf0 | |
| parent | cf7150cd9d23b77feb7cc45dc4c9cb89e2491ede (diff) | |
| parent | d95bab41a1d6f030ed5ef174109573104a561b9e (diff) | |
Merge pull request #15604 from dkanada/opf-metadata
extract local metadata from OPF and EPUB files
5 files changed, 676 insertions, 0 deletions
// Reconstructed from the whitespace-collapsed patch "extract local metadata from OPF and EPUB files"
// (merge of pull request #15604). The patch adds five files under
// MediaBrowser.Providers/Books/OpenPackagingFormat/; each one is kept in its own namespace block below
// and is introduced by a comment naming the file it belongs to.

using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.IO.Compression;
using System.Linq;
using System.Threading;
using System.Threading.Tasks;
using System.Xml;
using System.Xml.Linq;
using Jellyfin.Data.Enums;
using MediaBrowser.Controller.Entities;
using MediaBrowser.Controller.Providers;
using MediaBrowser.Model.Entities;
using MediaBrowser.Model.IO;
using MediaBrowser.Model.Net;
using Microsoft.Extensions.Logging;

// ---------------------------------------------------------------------------------------------
// File: MediaBrowser.Providers/Books/OpenPackagingFormat/EpubImageProvider.cs
// ---------------------------------------------------------------------------------------------
namespace MediaBrowser.Providers.Books.OpenPackagingFormat
{
    /// <summary>
    /// Provides the primary image for EPUB items that have embedded covers.
    /// </summary>
    public class EpubImageProvider : IDynamicImageProvider
    {
        private readonly ILogger<EpubImageProvider> _logger;

        /// <summary>
        /// Initializes a new instance of the <see cref="EpubImageProvider"/> class.
        /// </summary>
        /// <param name="logger">Instance of the <see cref="ILogger{EpubImageProvider}"/> interface.</param>
        public EpubImageProvider(ILogger<EpubImageProvider> logger)
        {
            _logger = logger;
        }

        /// <inheritdoc />
        public string Name => "EPUB Metadata";

        /// <inheritdoc />
        public bool Supports(BaseItem item)
        {
            return item is Book;
        }

        /// <inheritdoc />
        public IEnumerable<ImageType> GetSupportedImages(BaseItem item)
        {
            yield return ImageType.Primary;
        }

        /// <inheritdoc />
        public Task<DynamicImageResponse> GetImage(BaseItem item, ImageType type, CancellationToken cancellationToken)
        {
            // Only files with an .epub extension are opened as archives; everything else has no embedded cover.
            if (string.Equals(Path.GetExtension(item.Path), ".epub", StringComparison.OrdinalIgnoreCase))
            {
                return GetFromZip(item, cancellationToken);
            }

            return Task.FromResult(new DynamicImageResponse { HasImage = false });
        }

        /// <summary>
        /// Copies the cover image referenced by the OPF document out of the archive.
        /// </summary>
        /// <param name="epub">The opened EPUB archive.</param>
        /// <param name="opf">The parsed OPF document.</param>
        /// <param name="opfRootDirectory">The '/'-separated archive directory that contains the OPF file.</param>
        /// <param name="cancellationToken">The cancellation token.</param>
        /// <returns>A response carrying the image stream, or a response with <c>HasImage = false</c>.</returns>
        private async Task<DynamicImageResponse> LoadCover(ZipArchive epub, XmlDocument opf, string opfRootDirectory, CancellationToken cancellationToken)
        {
            var reader = new OpfReader<EpubImageProvider>(opf, _logger);
            var coverReference = reader.ReadCoverPath(opfRootDirectory);
            if (coverReference is null)
            {
                return new DynamicImageResponse { HasImage = false };
            }

            var cover = coverReference.Value;

            // Manifest hrefs may be percent-encoded while archive entry names are not, so retry with the
            // decoded form when the literal href is not present in the archive.
            var coverFile = epub.GetEntry(cover.Path) ?? epub.GetEntry(Uri.UnescapeDataString(cover.Path));
            if (coverFile is null)
            {
                return new DynamicImageResponse { HasImage = false };
            }

            // The archive is disposed by the caller once this method returns, so the image has to be
            // buffered in memory before it is handed out.
            var memoryStream = new MemoryStream();
            try
            {
                using var coverStream = coverFile.Open();
                await coverStream.CopyToAsync(memoryStream, cancellationToken).ConfigureAwait(false);
            }
            catch
            {
                memoryStream.Dispose();
                throw;
            }

            memoryStream.Position = 0;

            var response = new DynamicImageResponse { HasImage = true, Stream = memoryStream };
            response.SetFormatFromMimeType(cover.MimeType);

            return response;
        }

        /// <summary>
        /// Opens the EPUB archive of <paramref name="item"/>, locates its OPF document and loads the cover.
        /// </summary>
        /// <param name="item">The item whose path points at the EPUB file.</param>
        /// <param name="cancellationToken">The cancellation token.</param>
        /// <returns>The cover response, or a response with <c>HasImage = false</c> when nothing usable is found.</returns>
        private async Task<DynamicImageResponse> GetFromZip(BaseItem item, CancellationToken cancellationToken)
        {
            cancellationToken.ThrowIfCancellationRequested();

            using var epub = ZipFile.OpenRead(item.Path);

            var opfFilePath = EpubUtils.ReadContentFilePath(epub);
            if (string.IsNullOrEmpty(opfFilePath))
            {
                return new DynamicImageResponse { HasImage = false };
            }

            var opfFile = epub.GetEntry(opfFilePath);
            if (opfFile is null)
            {
                return new DynamicImageResponse { HasImage = false };
            }

            // Archive paths always use '/', so the directory part is cut out by hand: the System.IO.Path
            // helpers would introduce '\' on Windows and the resulting cover path would never match an entry.
            var separatorIndex = opfFilePath.LastIndexOf('/');
            var opfRootDirectory = separatorIndex < 0 ? string.Empty : opfFilePath[..separatorIndex];

            using var opfStream = opfFile.Open();

            var opfDocument = new XmlDocument();
            opfDocument.Load(opfStream);

            return await LoadCover(epub, opfDocument, opfRootDirectory, cancellationToken).ConfigureAwait(false);
        }
    }
}

// ---------------------------------------------------------------------------------------------
// File: MediaBrowser.Providers/Books/OpenPackagingFormat/EpubProvider.cs
// ---------------------------------------------------------------------------------------------
namespace MediaBrowser.Providers.Books.OpenPackagingFormat
{
    /// <summary>
    /// Provides book metadata from OPF content in an EPUB item.
    /// </summary>
    public class EpubProvider : ILocalMetadataProvider<Book>
    {
        private readonly IFileSystem _fileSystem;
        private readonly ILogger<EpubProvider> _logger;

        /// <summary>
        /// Initializes a new instance of the <see cref="EpubProvider"/> class.
        /// </summary>
        /// <param name="fileSystem">Instance of the <see cref="IFileSystem"/> interface.</param>
        /// <param name="logger">Instance of the <see cref="ILogger{EpubProvider}"/> interface.</param>
        public EpubProvider(IFileSystem fileSystem, ILogger<EpubProvider> logger)
        {
            _fileSystem = fileSystem;
            _logger = logger;
        }

        /// <inheritdoc />
        public string Name => "EPUB Metadata";

        /// <inheritdoc />
        public Task<MetadataResult<Book>> GetMetadata(ItemInfo info, IDirectoryService directoryService, CancellationToken cancellationToken)
        {
            var path = GetEpubFile(info.Path)?.FullName;
            if (path is null)
            {
                return Task.FromResult(new MetadataResult<Book> { HasMetadata = false });
            }

            var result = ReadEpubAsZip(path, cancellationToken) ?? new MetadataResult<Book> { HasMetadata = false };

            return Task.FromResult(result);
        }

        /// <summary>
        /// Resolves <paramref name="path"/> to a file entry when it is a non-directory with an .epub extension.
        /// </summary>
        /// <param name="path">The item path.</param>
        /// <returns>The file information, or <c>null</c> when the path is a directory or not an EPUB.</returns>
        private FileSystemMetadata? GetEpubFile(string path)
        {
            var fileInfo = _fileSystem.GetFileSystemInfo(path);
            if (fileInfo.IsDirectory)
            {
                return null;
            }

            var isEpub = string.Equals(Path.GetExtension(fileInfo.FullName), ".epub", StringComparison.OrdinalIgnoreCase);

            return isEpub ? fileInfo : null;
        }

        /// <summary>
        /// Opens the EPUB archive and reads the metadata from the OPF document it points at.
        /// </summary>
        /// <param name="path">The full path of the EPUB file.</param>
        /// <param name="cancellationToken">The cancellation token.</param>
        /// <returns>The metadata, or <c>null</c> when the archive has no readable OPF entry.</returns>
        private MetadataResult<Book>? ReadEpubAsZip(string path, CancellationToken cancellationToken)
        {
            using var epub = ZipFile.OpenRead(path);

            var opfFilePath = EpubUtils.ReadContentFilePath(epub);
            if (opfFilePath is null)
            {
                return null;
            }

            var opf = epub.GetEntry(opfFilePath);
            if (opf is null)
            {
                return null;
            }

            using var opfStream = opf.Open();

            var opfDocument = new XmlDocument();
            opfDocument.Load(opfStream);

            var reader = new OpfReader<EpubProvider>(opfDocument, _logger);
            return reader.ReadOpfData(cancellationToken);
        }
    }
}

// ---------------------------------------------------------------------------------------------
// File: MediaBrowser.Providers/Books/OpenPackagingFormat/EpubUtils.cs
// ---------------------------------------------------------------------------------------------
namespace MediaBrowser.Providers.Books.OpenPackagingFormat
{
    /// <summary>
    /// Utilities for EPUB files.
    /// </summary>
    public static class EpubUtils
    {
        // Archive entry names are '/'-separated on every platform, so this must not be built with
        // Path.Combine (which yields "META-INF\container.xml" on Windows and then never matches).
        private const string ContainerEntryName = "META-INF/container.xml";

        /// <summary>
        /// Attempt to read content from ZIP archive.
        /// </summary>
        /// <param name="epub">The ZIP archive.</param>
        /// <returns>
        /// The value of the <c>full-path</c> attribute of the first <c>rootfile</c> element in
        /// <c>META-INF/container.xml</c>, or <c>null</c> when the entry, element or attribute is missing.
        /// </returns>
        public static string? ReadContentFilePath(ZipArchive epub)
        {
            var container = epub.GetEntry(ContainerEntryName);
            if (container is null)
            {
                return null;
            }

            using var containerStream = container.Open();

            XNamespace containerNamespace = "urn:oasis:names:tc:opendocument:xmlns:container";
            var containerDocument = XDocument.Load(containerStream);
            var rootFile = containerDocument.Descendants(containerNamespace + "rootfile").FirstOrDefault();

            return rootFile?.Attribute("full-path")?.Value;
        }
    }
}

// ---------------------------------------------------------------------------------------------
// File: MediaBrowser.Providers/Books/OpenPackagingFormat/OpfProvider.cs
// ---------------------------------------------------------------------------------------------
namespace MediaBrowser.Providers.Books.OpenPackagingFormat
{
    /// <summary>
    /// Provides metadata for book items that have an OPF file in the same directory. Supports the standard
    /// content.opf filename, bespoke metadata.opf name from Calibre libraries, and OPF files that have the
    /// same name as their respective books for directories with several books.
    /// </summary>
    public class OpfProvider : ILocalMetadataProvider<Book>, IHasItemChangeMonitor
    {
        private const string StandardOpfFile = "content.opf";
        private const string CalibreOpfFile = "metadata.opf";

        private readonly IFileSystem _fileSystem;

        private readonly ILogger<OpfProvider> _logger;

        /// <summary>
        /// Initializes a new instance of the <see cref="OpfProvider"/> class.
        /// </summary>
        /// <param name="fileSystem">Instance of the <see cref="IFileSystem"/> interface.</param>
        /// <param name="logger">Instance of the <see cref="ILogger{OpfProvider}"/> interface.</param>
        public OpfProvider(IFileSystem fileSystem, ILogger<OpfProvider> logger)
        {
            _fileSystem = fileSystem;
            _logger = logger;
        }

        /// <inheritdoc />
        public string Name => "Open Packaging Format";

        /// <inheritdoc />
        public bool HasChanged(BaseItem item, IDirectoryService directoryService)
        {
            var file = GetXmlFile(item.Path);

            return file.Exists && _fileSystem.GetLastWriteTimeUtc(file) > item.DateLastSaved;
        }

        /// <inheritdoc />
        public Task<MetadataResult<Book>> GetMetadata(ItemInfo info, IDirectoryService directoryService, CancellationToken cancellationToken)
        {
            var file = GetXmlFile(info.Path);

            // Having no OPF file next to the book is the normal case, so it is answered without
            // going through an exception.
            if (!file.Exists)
            {
                return Task.FromResult(new MetadataResult<Book> { HasMetadata = false });
            }

            try
            {
                return Task.FromResult(ReadOpfData(file.FullName, cancellationToken));
            }
            catch (FileNotFoundException)
            {
                // The file disappeared between the existence check and the read.
                return Task.FromResult(new MetadataResult<Book> { HasMetadata = false });
            }
        }

        /// <summary>
        /// Picks the OPF file that belongs to <paramref name="path"/>.
        /// </summary>
        /// <param name="path">The item path (a book file or a directory).</param>
        /// <returns>
        /// The first existing candidate out of "&lt;name&gt;.opf", content.opf and metadata.opf; when none
        /// exists the (non-existing) metadata.opf entry is returned, so callers must check <c>Exists</c>.
        /// </returns>
        private FileSystemMetadata GetXmlFile(string path)
        {
            var fileInfo = _fileSystem.GetFileSystemInfo(path);
            var directoryInfo = fileInfo.IsDirectory ? fileInfo : _fileSystem.GetDirectoryInfo(Path.GetDirectoryName(path)!);
            var directory = directoryInfo.FullName;

            // check for OPF with matching name first since it's the most specific filename
            var specificFile = _fileSystem.GetFileInfo(Path.Combine(directory, Path.GetFileNameWithoutExtension(path) + ".opf"));
            if (specificFile.Exists)
            {
                return specificFile;
            }

            var standardFile = _fileSystem.GetFileInfo(Path.Combine(directory, StandardOpfFile));
            if (standardFile.Exists)
            {
                return standardFile;
            }

            // check metadata.opf last since it's really only used by Calibre
            return _fileSystem.GetFileInfo(Path.Combine(directory, CalibreOpfFile));
        }

        /// <summary>
        /// Loads the OPF file from disk and reads the book metadata from it.
        /// </summary>
        /// <param name="file">The full path of the OPF file.</param>
        /// <param name="cancellationToken">The cancellation token.</param>
        /// <returns>The metadata read from the file.</returns>
        private MetadataResult<Book> ReadOpfData(string file, CancellationToken cancellationToken)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var doc = new XmlDocument();
            doc.Load(file);

            var reader = new OpfReader<OpfProvider>(doc, _logger);
            return reader.ReadOpfData(cancellationToken);
        }
    }
}

// ---------------------------------------------------------------------------------------------
// File: MediaBrowser.Providers/Books/OpenPackagingFormat/OpfReader.cs
// ---------------------------------------------------------------------------------------------
namespace MediaBrowser.Providers.Books.OpenPackagingFormat
{
    /// <summary>
    /// Methods used to pull metadata and other information from Open Packaging Format in XML objects.
    /// </summary>
    /// <typeparam name="TCategoryName">The type of category.</typeparam>
    public class OpfReader<TCategoryName>
    {
        private const string DcNamespace = @"http://purl.org/dc/elements/1.1/";
        private const string OpfNamespace = @"http://www.idpf.org/2007/opf";

        // specification has no rules about subject content and some books combine every genre into a single element
        private static readonly string[] _genreSeparators = ["/", "&", ",", ";", " - "];

        private readonly XmlNamespaceManager _namespaceManager;
        private readonly XmlDocument _document;

        private readonly ILogger<TCategoryName> _logger;

        /// <summary>
        /// Initializes a new instance of the <see cref="OpfReader{TCategoryName}"/> class.
        /// </summary>
        /// <param name="document">The XML document to parse.</param>
        /// <param name="logger">Instance of the <see cref="ILogger{TCategoryName}"/> interface.</param>
        public OpfReader(XmlDocument document, ILogger<TCategoryName> logger)
        {
            _document = document;
            _logger = logger;
            _namespaceManager = new XmlNamespaceManager(_document.NameTable);

            _namespaceManager.AddNamespace("dc", DcNamespace);
            _namespaceManager.AddNamespace("opf", OpfNamespace);
        }

        /// <summary>
        /// Checks for the existence of a cover image.
        /// </summary>
        /// <param name="opfRootDirectory">The root directory in which the OPF file is located.</param>
        /// <returns>Returns the found cover and its type or null.</returns>
        /// <remarks>
        /// The lookups run in this order and the first one that yields a supported image wins:
        /// a manifest item whose <c>properties</c> contain the <c>cover-image</c> token; an item with
        /// <c>id="cover"</c> and an <c>image/</c> media type; an item whose id ends with <c>cover-image</c>;
        /// and finally the <c>&lt;meta name="cover"&gt;</c> element, whose content is tried first as a file
        /// name below <c>Images/</c> and then as a manifest item id. The returned path is joined with '/'
        /// because it is used as an archive entry name.
        /// </remarks>
        public (string MimeType, string Path)? ReadCoverPath(string opfRootDirectory)
        {
            // "properties" is a space separated list, so look for the token rather than the whole value.
            var coverImage = ReadEpubCoverInto(opfRootDirectory, "//opf:item[contains(concat(' ', normalize-space(@properties), ' '), ' cover-image ')]");
            if (coverImage is not null)
            {
                return coverImage;
            }

            // XPath 1.0 has no glob wildcards: "image/*" has to be spelled as a prefix test.
            var coverId = ReadEpubCoverInto(opfRootDirectory, "//opf:item[@id='cover' and starts-with(@media-type, 'image/')]");
            if (coverId is not null)
            {
                return coverId;
            }

            // XPath 1.0 has no ends-with() either; compare the last 11 characters ("cover-image") instead.
            var coverImageId = ReadEpubCoverInto(opfRootDirectory, "//opf:item[substring(@id, string-length(@id) - 10) = 'cover-image']");
            if (coverImageId is not null)
            {
                return coverImageId;
            }

            var metaCoverImage = _document.SelectSingleNode("//opf:meta[@name='cover']", _namespaceManager);
            var content = metaCoverImage?.Attributes?["content"]?.Value;
            if (string.IsNullOrEmpty(content))
            {
                return null;
            }

            var coverPath = CombineArchivePath("Images", content);
            var coverFileManifest = FindFirstByAttribute("//opf:item", "href", coverPath);
            var mediaType = coverFileManifest?.Attributes?["media-type"]?.Value;
            if (!string.IsNullOrEmpty(mediaType) && IsValidImage(mediaType))
            {
                return (mediaType, CombineArchivePath(opfRootDirectory, coverPath));
            }

            var coverFileIdManifest = FindFirstByAttribute("//opf:item", "id", content);

            return coverFileIdManifest is null ? null : ReadManifestItem(coverFileIdManifest, opfRootDirectory);
        }

        /// <summary>
        /// Read all supported OPF data from the file.
        /// </summary>
        /// <param name="cancellationToken">The cancellation token.</param>
        /// <returns>The metadata result to update.</returns>
        public MetadataResult<Book> ReadOpfData(CancellationToken cancellationToken)
        {
            cancellationToken.ThrowIfCancellationRequested();

            var result = new MetadataResult<Book> { Item = CreateBookFromOpf(), HasMetadata = true };

            FindAuthors(result);
            ReadStringInto("//dc:language", language => result.ResultLanguage = language);

            return result;
        }

        /// <summary>
        /// Joins two archive path segments with '/', independent of the host platform.
        /// </summary>
        /// <param name="directory">The directory part; may be empty.</param>
        /// <param name="relativePath">The path below <paramref name="directory"/>.</param>
        /// <returns>
        /// <paramref name="relativePath"/> itself when the directory is empty or the path already starts
        /// with '/', otherwise both parts joined by exactly one '/'.
        /// </returns>
        private static string CombineArchivePath(string directory, string relativePath)
        {
            if (directory.Length == 0 || relativePath.StartsWith('/'))
            {
                return relativePath;
            }

            return directory.EndsWith('/') ? directory + relativePath : directory + "/" + relativePath;
        }

        /// <summary>
        /// Maps an <c>opf:role</c> code to a person kind.
        /// </summary>
        /// <param name="role">The role code; may be null or empty.</param>
        /// <returns>The mapped kind; author for the author codes and for anything not listed.</returns>
        private static PersonKind GetRole(string? role) => role switch
        {
            "arr" => PersonKind.Arranger,
            "art" => PersonKind.Artist,
            "edt" => PersonKind.Editor,
            "ill" => PersonKind.Illustrator,
            "lyr" => PersonKind.Lyricist,
            "mus" => PersonKind.AlbumArtist,
            "oth" => PersonKind.Unknown,
            "trl" => PersonKind.Translator,

            // "aut", "aqt", "aft", "aui" and every unknown or missing code are treated as author
            _ => PersonKind.Author,
        };

        private static bool IsValidImage(string? mimeType)
        {
            return !string.IsNullOrEmpty(mimeType) && !string.IsNullOrWhiteSpace(MimeTypes.ToExtension(mimeType));
        }

        /// <summary>
        /// Builds the book item from the title, description, publisher, identifier, date, subject and
        /// Calibre series/rating entries of the document.
        /// </summary>
        /// <returns>The populated book.</returns>
        private Book CreateBookFromOpf()
        {
            var book = new Book
            {
                Name = FindMainTitle(),
                ForcedSortName = FindSortTitle(),
            };

            ReadStringInto("//dc:description", summary => book.Overview = summary);
            ReadStringInto("//dc:publisher", publisher => book.AddStudio(publisher));
            ReadStringInto("//dc:identifier[@opf:scheme='AMAZON']", amazon => book.SetProviderId("Amazon", amazon));
            ReadStringInto("//dc:identifier[@opf:scheme='GOOGLE']", google => book.SetProviderId("GoogleBooks", google));
            ReadStringInto("//dc:identifier[@opf:scheme='ISBN']", isbn => book.SetProviderId("ISBN", isbn));

            ReadStringInto("//dc:date", date =>
            {
                // The value is file data, not user input: parse it the same way regardless of the server culture.
                // NOTE(review): values carrying a UTC offset are still converted to local time before the
                // date part is taken - confirm whether that is intended.
                if (DateTime.TryParse(date, CultureInfo.InvariantCulture, DateTimeStyles.None, out var dateValue))
                {
                    book.PremiereDate = dateValue.Date;
                    book.ProductionYear = dateValue.Date.Year;
                }
            });

            var genreNodes = _document.SelectNodes("//dc:subject", _namespaceManager);
            if (genreNodes is not null)
            {
                foreach (XmlNode node in genreNodes)
                {
                    var subject = node.InnerText;
                    if (string.IsNullOrEmpty(subject) || book.Genres.Contains(subject))
                    {
                        continue;
                    }

                    foreach (var genre in subject.Split(_genreSeparators, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries))
                    {
                        book.AddGenre(genre);
                    }
                }
            }

            ReadInt32AttributeInto("//opf:meta[@name='calibre:series_index']", index => book.IndexNumber = index);
            ReadInt32AttributeInto("//opf:meta[@name='calibre:rating']", rating => book.CommunityRating = rating);

            var seriesName = _document.SelectSingleNode("//opf:meta[@name='calibre:series']", _namespaceManager)?.Attributes?["content"]?.Value;
            if (!string.IsNullOrEmpty(seriesName))
            {
                book.SeriesName = seriesName;
            }

            return book;
        }

        /// <summary>
        /// Finds the title marked as "main" through a <c>title-type</c> meta element, falling back to the
        /// first <c>dc:title</c>.
        /// </summary>
        /// <returns>The title, or an empty string when the document has none.</returns>
        private string FindMainTitle()
        {
            var title = string.Empty;
            var titleTypes = _document.SelectNodes("//opf:meta[@property='title-type']", _namespaceManager);

            if (titleTypes is not null)
            {
                foreach (XmlNode node in titleTypes)
                {
                    if (node is not XmlElement titleNode
                        || !string.Equals(titleNode.InnerText.Trim(), "main", StringComparison.OrdinalIgnoreCase))
                    {
                        continue;
                    }

                    var refines = titleNode.GetAttribute("refines").TrimStart('#');
                    var titleElement = FindFirstByAttribute("//dc:title", "id", refines);
                    if (titleElement is not null)
                    {
                        // no break: when several titles are marked as main the last one wins
                        title = titleElement.InnerText;
                    }
                }
            }

            // fallback in case there is no main title definition
            if (string.IsNullOrEmpty(title))
            {
                ReadStringInto("//dc:title", titleString => title = titleString);
            }

            return title;
        }

        /// <summary>
        /// Finds the sort title: the first <c>file-as</c> meta element that refines a <c>dc:title</c>,
        /// otherwise the Calibre <c>title_sort</c> meta element.
        /// </summary>
        /// <returns>The sort title or null.</returns>
        private string? FindSortTitle()
        {
            var fileAsNodes = _document.SelectNodes("//opf:meta[@property='file-as']", _namespaceManager);

            if (fileAsNodes is not null)
            {
                foreach (XmlNode node in fileAsNodes)
                {
                    if (node is not XmlElement fileAsNode)
                    {
                        continue;
                    }

                    // file-as may also refine creators, so make sure the target really is a title
                    var refines = fileAsNode.GetAttribute("refines").TrimStart('#');
                    if (FindFirstByAttribute("//dc:title", "id", refines) is not null)
                    {
                        return fileAsNode.InnerText;
                    }
                }
            }

            // search for OPF 2.0 style title_sort node
            var resultElement = _document.SelectSingleNode("//opf:meta[@name='calibre:title_sort']", _namespaceManager);

            return resultElement?.Attributes?["content"]?.Value;
        }

        /// <summary>
        /// Adds every named <c>dc:creator</c> of the document as a person of the result.
        /// </summary>
        /// <param name="book">The result to add the people to.</param>
        private void FindAuthors(MetadataResult<Book> book)
        {
            var creators = _document.SelectNodes("//dc:creator", _namespaceManager);
            if (creators is null)
            {
                return;
            }

            foreach (XmlNode node in creators)
            {
                if (node is not XmlElement creator || string.IsNullOrWhiteSpace(creator.InnerText))
                {
                    continue;
                }

                // Resolve the attribute by namespace so it is found whatever prefix the document uses,
                // and keep the literal "opf:role" lookup as a fallback.
                var role = creator.GetAttribute("role", OpfNamespace);
                if (role.Length == 0)
                {
                    role = creator.GetAttribute("opf:role");
                }

                book.AddPerson(new PersonInfo { Name = creator.InnerText, Type = GetRole(role) });
            }
        }

        /// <summary>
        /// Returns the first node selected by <paramref name="xmlPath"/> whose attribute equals the given value.
        /// </summary>
        /// <param name="xmlPath">The XPath expression selecting the candidates.</param>
        /// <param name="attributeName">The name of the attribute to compare.</param>
        /// <param name="attributeValue">The expected attribute value.</param>
        /// <returns>The first matching node in document order, or null.</returns>
        /// <remarks>
        /// The comparison is done in code because the value comes from the document: embedding it in an
        /// XPath string literal produces an invalid expression as soon as it contains a quote character.
        /// </remarks>
        private XmlNode? FindFirstByAttribute(string xmlPath, string attributeName, string attributeValue)
        {
            var candidates = _document.SelectNodes(xmlPath, _namespaceManager);
            if (candidates is null)
            {
                return null;
            }

            foreach (XmlNode candidate in candidates)
            {
                if (string.Equals(candidate.Attributes?[attributeName]?.Value, attributeValue, StringComparison.Ordinal))
                {
                    return candidate;
                }
            }

            return null;
        }

        /// <summary>
        /// Passes the text of the first node matching <paramref name="xmlPath"/> to
        /// <paramref name="commitResult"/> unless it is missing or blank.
        /// </summary>
        /// <param name="xmlPath">The XPath expression.</param>
        /// <param name="commitResult">The callback receiving the text.</param>
        private void ReadStringInto(string xmlPath, Action<string> commitResult)
        {
            var text = _document.SelectSingleNode(xmlPath, _namespaceManager)?.InnerText;
            if (!string.IsNullOrWhiteSpace(text))
            {
                commitResult(text);
            }
        }

        /// <summary>
        /// Passes the <c>content</c> attribute of the first node matching <paramref name="xmlPath"/>,
        /// rounded to an integer, to <paramref name="commitResult"/>.
        /// </summary>
        /// <param name="xmlPath">The XPath expression.</param>
        /// <param name="commitResult">The callback receiving the number.</param>
        private void ReadInt32AttributeInto(string xmlPath, Action<int> commitResult)
        {
            var resultElement = _document.SelectSingleNode(xmlPath, _namespaceManager);
            var resultValue = resultElement?.Attributes?["content"]?.Value;
            if (string.IsNullOrEmpty(resultValue))
            {
                return;
            }

            // The value may be written with decimals (e.g. "1.0"), so it is parsed as a double and rounded
            // to the nearest integer, ties to even.
            var isNumber = double.TryParse(resultValue, NumberStyles.Float | NumberStyles.AllowThousands, CultureInfo.InvariantCulture, out var parsed);
            var rounded = Math.Round(parsed);

            // written as a positive range test so that NaN is rejected as well
            if (isNumber && rounded >= int.MinValue && rounded <= int.MaxValue)
            {
                commitResult((int)rounded);
            }
            else
            {
                _logger.LogError("error converting {Value} to Int32", resultValue);
            }
        }

        /// <summary>
        /// Reads the cover from the first manifest item matching <paramref name="xmlPath"/>.
        /// </summary>
        /// <param name="opfRootDirectory">The archive directory that contains the OPF file.</param>
        /// <param name="xmlPath">The XPath expression selecting the manifest item.</param>
        /// <returns>The cover, or null when nothing matches or the match is not a supported image.</returns>
        private (string MimeType, string Path)? ReadEpubCoverInto(string opfRootDirectory, string xmlPath)
        {
            var resultElement = _document.SelectSingleNode(xmlPath, _namespaceManager);

            return resultElement is null ? null : ReadManifestItem(resultElement, opfRootDirectory);
        }

        /// <summary>
        /// Turns a manifest item into a cover reference.
        /// </summary>
        /// <param name="manifestNode">The manifest item.</param>
        /// <param name="opfRootDirectory">The archive directory that contains the OPF file.</param>
        /// <returns>
        /// The media type and archive path, or null when <c>href</c> or <c>media-type</c> is missing or the
        /// media type is not accepted by <see cref="IsValidImage"/>.
        /// </returns>
        private (string MimeType, string Path)? ReadManifestItem(XmlNode manifestNode, string opfRootDirectory)
        {
            var href = manifestNode.Attributes?["href"]?.Value;
            var mediaType = manifestNode.Attributes?["media-type"]?.Value;

            if (string.IsNullOrEmpty(href) || string.IsNullOrEmpty(mediaType) || !IsValidImage(mediaType))
            {
                return null;
            }

            return (MimeType: mediaType, Path: CombineArchivePath(opfRootDirectory, href));
        }
    }
}
