using System.Net;
using System.Text;
using AngleSharp;
using AngleSharp.Dom;
using Microsoft.Extensions.Caching.Memory;

namespace Nuuru.Server.Services;

/// <summary>
/// Adds a preview card after every <c>a.bbcode-link</c> anchor in a forum post whose visible
/// text equals its <c>href</c> and which sits alone on its line. The card is built from the
/// linked page's Open Graph / Twitter / standard metadata; results are kept in the memory cache.
/// </summary>
public sealed class StandaloneLinkPreviewHtmlEnricher : IHtmlEnricher
{
    public HtmlEnrichmentPhase Phase => HtmlEnrichmentPhase.Deferred;

    /// <summary>Upper bound on the number of bytes downloaded from a linked page.</summary>
    private const int MaxHtmlBytes = 1024 * 1024;

    /// <summary>Lower-case ASCII marker; the download stops once it has been seen.</summary>
    private static readonly byte[] HeadCloseTagBytes = "</head>"u8.ToArray();

    private static readonly TimeSpan MetadataCacheDuration = TimeSpan.FromHours(12);

    /// <summary>Shorter lifetime for failures caused by an exception, so transient errors are retried.</summary>
    private static readonly TimeSpan FailedMetadataCacheDuration = TimeSpan.FromMinutes(15);

    /// <summary>Total time budget for requesting a page and reading its head section.</summary>
    private static readonly TimeSpan PageFetchTimeout = TimeSpan.FromSeconds(5);

    private static readonly IBrowsingContext BrowsingContext = AngleSharp.BrowsingContext.New(Configuration.Default);

    private readonly IHttpClientFactory _httpClientFactory;
    private readonly IRemoteImageInfoService _remoteImageInfoService;
    private readonly IMemoryCache _cache;
    private readonly ILogger<StandaloneLinkPreviewHtmlEnricher> _logger;

    public StandaloneLinkPreviewHtmlEnricher(
        IHttpClientFactory httpClientFactory,
        IRemoteImageInfoService remoteImageInfoService,
        IMemoryCache cache,
        ILogger<StandaloneLinkPreviewHtmlEnricher> logger)
    {
        _httpClientFactory = httpClientFactory;
        _remoteImageInfoService = remoteImageInfoService;
        _cache = cache;
        _logger = logger;
    }

    /// <summary>
    /// Inserts preview cards into <paramref name="document"/> for eligible standalone links.
    /// </summary>
    /// <param name="document">Document to modify in place.</param>
    /// <param name="context">Only <see cref="HtmlEnrichmentTarget.ForumPost"/> targets are processed.</param>
    /// <param name="cancellationToken">Cancels URL checks and metadata fetches.</param>
    /// <returns><c>true</c> when at least one preview was inserted; otherwise <c>false</c>.</returns>
    public async Task<bool> EnrichAsync(IDocument document, HtmlEnrichmentContext context, CancellationToken cancellationToken = default)
    {
        if (context.Target != HtmlEnrichmentTarget.ForumPost)
            return false;

        var anchors = document.QuerySelectorAll("a.bbcode-link").ToList();
        if (anchors.Count == 0)
            return false;

        var candidates = new List<(IElement Anchor, string Url)>();
        foreach (var anchor in anchors)
        {
            var href = anchor.GetAttribute("href");

            // Only "bare" links qualify: the visible text must be the URL itself.
            if (string.IsNullOrWhiteSpace(href) || !string.Equals(anchor.TextContent.Trim(), href, StringComparison.Ordinal))
                continue;

            if (!Uri.TryCreate(href, UriKind.Absolute, out var uri))
                continue;

            if (!await RemoteFetchSupport.IsSafeRemoteUriAsync(uri, cancellationToken))
                continue;

            if (!IsStandaloneLine(anchor))
                continue;

            candidates.Add((anchor, href));
        }

        if (candidates.Count == 0)
            return false;

        // One fetch per distinct URL, all running concurrently.
        var metadataTasks = candidates
            .Select(candidate => candidate.Url)
            .Distinct(StringComparer.Ordinal)
            .ToDictionary(
                url => url,
                url => GetMetadataAsync(url, cancellationToken),
                StringComparer.Ordinal);

        await Task.WhenAll(metadataTasks.Values);

        var changed = false;
        foreach (var candidate in candidates)
        {
            var metadata = await metadataTasks[candidate.Url];
            if (metadata == null)
                continue;

            var preview = BuildPreviewElement(document, candidate.Url, metadata);
            if (candidate.Anchor.NextSibling != null)
                candidate.Anchor.Parent?.InsertBefore(preview, candidate.Anchor.NextSibling);
            else
                candidate.Anchor.Parent?.AppendChild(preview);

            changed = true;
        }

        return changed;
    }

    /// <summary>
    /// Returns cached metadata for <paramref name="url"/>, fetching and caching it on a miss.
    /// A cached <c>null</c> means "no preview available".
    /// </summary>
    /// <exception cref="OperationCanceledException">
    /// Propagated (and nothing is cached) when <paramref name="cancellationToken"/> is cancelled.
    /// </exception>
    private async Task<LinkPreviewMetadata?> GetMetadataAsync(string url, CancellationToken cancellationToken)
    {
        var cacheKey = $"link-preview:{url}";
        if (_cache.TryGetValue(cacheKey, out LinkPreviewMetadata? cached))
            return cached;

        LinkPreviewMetadata? metadata = null;
        var cacheDuration = MetadataCacheDuration;
        try
        {
            metadata = await FetchMetadataAsync(url, cancellationToken);
        }
        catch (Exception ex) when (!cancellationToken.IsCancellationRequested)
        {
            // Best effort: a failing remote page must not break post rendering. The filter lets
            // caller cancellation escape so an aborted request is never cached as a failure.
            _logger.LogDebug(ex, "Failed to fetch link preview metadata for {Url}", url);
            cacheDuration = FailedMetadataCacheDuration;
        }

        _cache.Set(cacheKey, metadata, cacheDuration);
        return metadata;
    }

    /// <summary>
    /// Downloads the head of the page at <paramref name="url"/> and extracts title, description,
    /// site name, favicon and preview image.
    /// </summary>
    /// <returns>The metadata, or <c>null</c> when the URL is not absolute or the page is not usable.</returns>
    private async Task<LinkPreviewMetadata?> FetchMetadataAsync(string url, CancellationToken cancellationToken)
    {
        if (!Uri.TryCreate(url, UriKind.Absolute, out var pageUri))
            return null;

        if (await DownloadHeadHtmlAsync(pageUri, cancellationToken) is not { } page)
            return null;

        var html = page.Html;
        var documentUri = page.Address;

        // The post-redirect address is used so relative asset URLs resolve against the page actually served.
        using var remoteDocument = await BrowsingContext.OpenAsync(req => req.Content(html).Address(documentUri), cancellationToken);

        var title = GetMetaContent(remoteDocument, "property", "og:title")
            ?? GetMetaContent(remoteDocument, "name", "twitter:title")
            ?? remoteDocument.Title
            ?? pageUri.Host;
        var description = GetMetaContent(remoteDocument, "property", "og:description")
            ?? GetMetaContent(remoteDocument, "name", "description")
            ?? GetMetaContent(remoteDocument, "name", "twitter:description");
        var siteName = GetMetaContent(remoteDocument, "property", "og:site_name")
            ?? pageUri.Host;

        var faviconUrl = ResolveAssetUrl(
            remoteDocument.QuerySelector("link[rel~='icon'], link[rel='shortcut icon'], link[rel='apple-touch-icon']")?.GetAttribute("href"),
            remoteDocument.BaseUri);
        var imageUrl = ResolveAssetUrl(
            GetMetaContent(remoteDocument, "property", "og:image")
            ?? GetMetaContent(remoteDocument, "name", "twitter:image"),
            remoteDocument.BaseUri);

        var favicon = await _remoteImageInfoService.GetAsync(faviconUrl, 64 * 1024, cancellationToken);
        var image = await _remoteImageInfoService.GetAsync(imageUrl, 256 * 1024, cancellationToken);

        return new LinkPreviewMetadata(
            CollapseWhitespace(WebUtility.HtmlDecode(title)),
            CollapseWhitespace(WebUtility.HtmlDecode(description ?? string.Empty)),
            CollapseWhitespace(WebUtility.HtmlDecode(siteName)),
            favicon?.DataUrl,
            image?.DataUrl);
    }

    /// <summary>
    /// Requests <paramref name="pageUri"/> and reads the response until the closing head tag,
    /// the end of the stream, or <see cref="MaxHtmlBytes"/>, whichever comes first.
    /// </summary>
    /// <returns>
    /// The decoded text and the final request URI, or <c>null</c> for a non-success status,
    /// a non-HTML media type, or a redirect target that fails the remote-URI safety check.
    /// </returns>
    private async Task<(string Html, Uri Address)?> DownloadHeadHtmlAsync(Uri pageUri, CancellationToken cancellationToken)
    {
        // HttpClient.Timeout stops applying once headers arrive when ResponseHeadersRead is used,
        // so a linked token bounds the request and the body read together.
        using var timeout = CancellationTokenSource.CreateLinkedTokenSource(cancellationToken);
        timeout.CancelAfter(PageFetchTimeout);

        var client = _httpClientFactory.CreateClient();

        using var request = new HttpRequestMessage(HttpMethod.Get, pageUri);
        RemoteFetchSupport.ApplyBrowserHeaders(request, pageUri, isImageRequest: false);

        using var response = await client.SendAsync(request, HttpCompletionOption.ResponseHeadersRead, timeout.Token);
        if (!response.IsSuccessStatusCode)
            return null;

        // NOTE(review): this only stops content served after a redirect to an unsafe host from
        // being used; the redirected request itself has already been sent. Full protection needs
        // auto-redirect disabled (or a connect-time check) on the handler behind the factory.
        var finalUri = response.RequestMessage?.RequestUri ?? pageUri;
        if (finalUri != pageUri && !await RemoteFetchSupport.IsSafeRemoteUriAsync(finalUri, timeout.Token))
            return null;

        var contentType = response.Content.Headers.ContentType;
        if (!IsHtmlMediaType(contentType?.MediaType))
            return null;

        await using var stream = await response.Content.ReadAsStreamAsync(timeout.Token);
        using var buffer = new MemoryStream();
        var chunk = new byte[8192];

        while (buffer.Length < MaxHtmlBytes)
        {
            var bytesToRead = Math.Min(chunk.Length, MaxHtmlBytes - (int)buffer.Length);
            var read = await stream.ReadAsync(chunk.AsMemory(0, bytesToRead), timeout.Token);
            if (read == 0)
                break;

            var previousLength = (int)buffer.Length;
            buffer.Write(chunk, 0, read);

            // Rescan only the new bytes plus enough overlap to catch a tag split across chunks.
            var scanStart = Math.Max(0, previousLength - (HeadCloseTagBytes.Length - 1));
            if (ContainsHeadCloseTag(buffer, scanStart))
                break;
        }

        return (GetContentString(buffer.ToArray(), contentType?.CharSet), finalUri);
    }

    /// <summary>Media types are case-insensitive, so compare them that way.</summary>
    private static bool IsHtmlMediaType(string? mediaType)
    {
        return string.Equals(mediaType, "text/html", StringComparison.OrdinalIgnoreCase)
            || string.Equals(mediaType, "application/xhtml+xml", StringComparison.OrdinalIgnoreCase);
    }

    /// <summary>
    /// Decodes <paramref name="bytes"/> using <paramref name="charset"/> when it names a usable
    /// encoding, otherwise as UTF-8.
    /// </summary>
    private static string GetContentString(byte[] bytes, string? charset)
    {
        if (!string.IsNullOrWhiteSpace(charset))
        {
            try
            {
                return Encoding.GetEncoding(charset.Trim('"', '\'')).GetString(bytes);
            }
            catch (Exception ex) when (ex is ArgumentException or NotSupportedException)
            {
                // Unknown or unsupported charset label: fall through to UTF-8.
            }
        }

        return Encoding.UTF8.GetString(bytes);
    }

    /// <summary>
    /// Reports whether the written part of <paramref name="buffer"/> contains the closing head
    /// tag (ASCII case-insensitive), looking only at matches that begin at or after
    /// <paramref name="startIndex"/>.
    /// </summary>
    private static bool ContainsHeadCloseTag(MemoryStream buffer, int startIndex = 0)
    {
        var span = buffer.GetBuffer().AsSpan(0, (int)buffer.Length);
        var maxStart = span.Length - HeadCloseTagBytes.Length;

        for (var i = Math.Max(0, startIndex); i <= maxStart; i++)
        {
            var matches = true;
            for (var j = 0; j < HeadCloseTagBytes.Length; j++)
            {
                if (ToAsciiLower(span[i + j]) != HeadCloseTagBytes[j])
                {
                    matches = false;
                    break;
                }
            }

            if (matches)
                return true;
        }

        return false;
    }

    /// <summary>Lower-cases ASCII <c>A</c>-<c>Z</c>; every other byte is returned unchanged.</summary>
    private static byte ToAsciiLower(byte value)
    {
        return value is >= (byte)'A' and <= (byte)'Z' ? (byte)(value + 32) : value;
    }

    /// <summary>
    /// Returns the first non-blank <c>content</c> value among <c>meta</c> elements whose
    /// <paramref name="attributeName"/> equals <paramref name="attributeValue"/> (ignoring case),
    /// or <c>null</c> when there is none. Blank values yield <c>null</c> so callers' <c>??</c>
    /// fallback chains can continue.
    /// </summary>
    private static string? GetMetaContent(IDocument document, string attributeName, string attributeValue)
    {
        return document
            .QuerySelectorAll("meta")
            .Where(meta => string.Equals(meta.GetAttribute(attributeName), attributeValue, StringComparison.OrdinalIgnoreCase))
            .Select(meta => meta.GetAttribute("content"))
            .FirstOrDefault(content => !string.IsNullOrWhiteSpace(content));
    }

    /// <summary>
    /// Resolves <paramref name="rawUrl"/> against <paramref name="baseUrl"/> and returns it as an
    /// absolute http/https URL, or <c>null</c> when either input is blank, resolution fails, or
    /// the scheme is anything else.
    /// </summary>
    private static string? ResolveAssetUrl(string? rawUrl, string? baseUrl)
    {
        if (string.IsNullOrWhiteSpace(rawUrl) || string.IsNullOrWhiteSpace(baseUrl))
            return null;

        if (!Uri.TryCreate(baseUrl, UriKind.Absolute, out var baseUri))
            return null;

        if (!Uri.TryCreate(baseUri, WebUtility.HtmlDecode(rawUrl), out var resolved))
            return null;

        // AbsoluteUri keeps percent-escapes intact; ToString() returns an unescaped display form.
        return resolved.Scheme is "http" or "https" ? resolved.AbsoluteUri : null;
    }

    /// <summary>
    /// Reports whether only whitespace siblings separate <paramref name="anchor"/> from the
    /// nearest <c>br</c> (or the start/end of its parent) on both sides.
    /// </summary>
    private static bool IsStandaloneLine(INode anchor)
    {
        for (var node = anchor.PreviousSibling; node != null; node = node.PreviousSibling)
        {
            if (IsLineBreak(node))
                break;

            if (!IsWhitespaceNode(node))
                return false;
        }

        for (var node = anchor.NextSibling; node != null; node = node.NextSibling)
        {
            if (IsLineBreak(node))
                break;

            if (!IsWhitespaceNode(node))
                return false;
        }

        return true;
    }

    private static bool IsLineBreak(INode node)
    {
        return node is IElement element && element.TagName.Equals("BR", StringComparison.OrdinalIgnoreCase);
    }

    // NOTE(review): this is decided by text content alone, so an element without text (for
    // example an <img>) also counts as whitespace. Confirm that is intended.
    private static bool IsWhitespaceNode(INode node)
    {
        return string.IsNullOrWhiteSpace(node.TextContent);
    }

    /// <summary>
    /// Builds the preview markup: a <c>div.bbcode-link-preview</c> wrapping a link card with an
    /// optional image, a head row (optional favicon plus site name), a title and an optional
    /// description. Text is assigned through <c>TextContent</c>, never as markup.
    /// </summary>
    private static IElement BuildPreviewElement(IDocument document, string url, LinkPreviewMetadata metadata)
    {
        var wrapper = document.CreateElement("div");
        wrapper.ClassName = "bbcode-link-preview";

        var card = document.CreateElement("a");
        card.ClassName = "bbcode-link-preview-card";
        card.SetAttribute("href", url);
        card.SetAttribute("rel", "nofollow noopener");
        card.SetAttribute("target", "_blank");
        wrapper.AppendChild(card);

        if (!string.IsNullOrWhiteSpace(metadata.ImageDataUrl))
        {
            var image = document.CreateElement("img");
            image.ClassName = "bbcode-link-preview-image";
            image.SetAttribute("src", metadata.ImageDataUrl);
            image.SetAttribute("alt", string.Empty);
            image.SetAttribute("loading", "lazy");
            card.AppendChild(image);
        }

        var body = document.CreateElement("span");
        body.ClassName = "bbcode-link-preview-body";
        card.AppendChild(body);

        var head = document.CreateElement("span");
        head.ClassName = "bbcode-link-preview-head";
        body.AppendChild(head);

        if (!string.IsNullOrWhiteSpace(metadata.FaviconDataUrl))
        {
            var favicon = document.CreateElement("img");
            favicon.ClassName = "bbcode-link-preview-favicon";
            favicon.SetAttribute("src", metadata.FaviconDataUrl);
            favicon.SetAttribute("alt", string.Empty);
            favicon.SetAttribute("loading", "lazy");
            head.AppendChild(favicon);
        }

        var site = document.CreateElement("span");
        site.ClassName = "bbcode-link-preview-site";
        site.TextContent = string.IsNullOrWhiteSpace(metadata.SiteName) ? new Uri(url).Host : metadata.SiteName;
        head.AppendChild(site);

        var title = document.CreateElement("span");
        title.ClassName = "bbcode-link-preview-title";
        title.TextContent = string.IsNullOrWhiteSpace(metadata.Title) ? url : metadata.Title;
        body.AppendChild(title);

        if (!string.IsNullOrWhiteSpace(metadata.Description))
        {
            var description = document.CreateElement("span");
            description.ClassName = "bbcode-link-preview-description";
            description.TextContent = metadata.Description;
            body.AppendChild(description);
        }

        return wrapper;
    }

    /// <summary>Trims <paramref name="value"/> and collapses every whitespace run to one space.</summary>
    private static string CollapseWhitespace(string value)
    {
        return string.Join(
            " ",
            (value ?? string.Empty).Split((char[]?)null, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries));
    }

    private sealed record LinkPreviewMetadata(
        string Title,
        string Description,
        string SiteName,
        string? FaviconDataUrl,
        string? ImageDataUrl);
}