// Copyright (C) 2024 Vaughn Nugent
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.

using System;
using System.IO;
using System.Text;
using System.Threading;
using System.Diagnostics;
using System.ComponentModel;
using System.Threading.Tasks;
using System.Collections.Generic;

using VNLib.Utils.Memory;
using VNLib.Utils.Extensions;

namespace SimpleBookmark.PlatformFeatures.Curl
{
    sealed class SystemCurlApp(string exePath, bool httpsOnly, string[] additionalArgs) : ISystemApp, ICurlApp
    {
        const int DefaultTimeoutMs = 5000;

        /// <inheritdoc/>
        public async Task<bool> TestIsAvailable(CancellationToken cancellation)
        {
            try
            {
                //Test if the curl application is available on the local system, it may be found on the system PATH
                using Process? process = Exec(["--version"]);

                if (process is null)
                {
                    return false;
                }

                //Wait for the process to exit
                await process.WaitForExitAsync(cancellation);

                //If an ok status code, then we know the curl application is available
                return process.ExitCode == 0;
            }
            //App not found
            catch (Win32Exception)
            {
                return false;
            }
        }

        private Process? Exec(string[] arguments)
        {
            ProcessStartInfo startInfo = new()
            {
                FileName = exePath,
                RedirectStandardOutput = true,
                RedirectStandardError = true,
                UseShellExecute = false,
                CreateNoWindow = true,
                StandardOutputEncoding = Encoding.UTF8,
                StandardErrorEncoding = Encoding.UTF8
            };

            //Add arguments
            arguments.ForEach(startInfo.ArgumentList.Add);

            return Process.Start(startInfo);
        }

        private void ValidateUrl(Uri? website)
        {
            ArgumentNullException.ThrowIfNull(website);

            if (!website.IsAbsoluteUri)
            {
                throw new ArgumentException("The website url must be an absolute uri", nameof(website));
            }

            if (httpsOnly && website.Scheme != Uri.UriSchemeHttps)
            {
                throw new ArgumentException("The website url must be an https url only!", nameof(website));
            }
            else if (website.Scheme != Uri.UriSchemeHttp && website.Scheme != Uri.UriSchemeHttps)
            {
                //Http or https only
                throw new ArgumentException("The website url must be an http or https url", nameof(website));
            }
        }

        /// <inheritdoc/>
        public async Task<CurlResult> ExecLookupAsync(Uri website, int? timeoutMs, CancellationToken cancellation)
        {
            //Validate the url
            ValidateUrl(website);

            string timeoutArg = timeoutMs.HasValue ? $"{timeoutMs.Value / 1000}" : $"{DefaultTimeoutMs / 1000}";

            string[] args =
            [
                "--max-time", timeoutArg,                       //Set the max time for the request
                "-S",                                           //Silent mode is required
                "-H", "Accept: text/html,application/html",     //Html is required
                ..additionalArgs,                               //Additional global arguments
                website.AbsoluteUri
            ];

            //Execute the curl command
            using Process? process = Exec(args);
            if (process is null)
            {
                return new CurlResult(null, true, "Curl is not enabled on this platform, lookup failed");
            }

            //Parse the html data
            Task<string?> documentHeadTask = HtmlTokenReader.ReadHeadTokenAsync(process.StandardOutput, cancellation);

            //Respect the user's timeout command and terminate the process if it exceeds the timeout
            if (timeoutMs.HasValue)
            {
                await documentHeadTask.WaitAsync(TimeSpan.FromMilliseconds(timeoutMs.Value));

                await Task.WhenAll(
                    DiscardStreamAsync(process.StandardOutput, cancellation),
                    DiscardStreamAsync(process.StandardError, cancellation)
                ).WaitAsync(TimeSpan.FromMilliseconds(timeoutMs.Value));
            }
            else
            {
                await documentHeadTask;

                await Task.WhenAll(
                    DiscardStreamAsync(process.StandardOutput, cancellation),
                    DiscardStreamAsync(process.StandardError, cancellation)
                );
            }

            await process.WaitForExitAsync(cancellation);

            if (process.ExitCode != 0)
            {
                return new CurlResult(null, true, "Curl exited with a non-zero status code");
            }

            string? documentHead = await documentHeadTask;

            if (documentHead is null)
            {
                return new CurlResult(null, true, "Failed to parse html data");
            }

            //Get the lookup result from the document head segment
            WebsiteLookupResult result = HtmlTokenReader.ParseHtmlData(documentHead);

            return new CurlResult(result, false, null);
        }

        /// <summary>
        /// Safely discards the entire stream of data from the reader without
        /// allocating a large string buffer
        /// </summary>
        /// <param name="reader">The reader to discard</param>
        /// <param name="cancellation">A token to cancel the operation</param>
        /// <returns>A task that represents the discard operation</returns>
        private static async Task DiscardStreamAsync(TextReader reader, CancellationToken cancellation)
        {
            using ArrayPoolBuffer<char> discardBuffer = new(8192);

            while (await reader.ReadBlockAsync(discardBuffer.AsMemory(), cancellation) > 0)
            { }
        }

        private static class HtmlTokenReader
        {
            /// <summary>
            /// Gets the document title from the head of the html document
            /// </summary>
            /// <param name="head">The head string containing the title to parse</param>
            /// <returns>The title string if found</returns>
            public static string? GetDocTitleFromHead(string head)
            {
                ReadOnlySpan<char> headChars = head.AsSpan();

                ReadOnlySpan<char> title = headChars.SliceAfterParam("<title>");
                title = title.SliceBeforeParam("</title>");

                return title.ToString();
            }

            /// <summary>
            /// Attempts to get the document summary from the head of the html document
            /// in the meta description tag
            /// </summary>
            /// <param name="head">The head string to parse</param>
            /// <returns>The document description if found</returns>
            public static string? GetDocumentSummary(string head)
            {
                ReadOnlySpan<char> headChars = head.AsSpan();

                ReadOnlySpan<char> desc = headChars.SliceAfterParam("<meta name=\"description\" content=\"");
                desc = desc.SliceBeforeParam("\">");

                return desc.ToString();
            }

            /// <summary>
            /// Attempts to get the document keywords from the head of the html document
            /// by parsing the meta keywords tag
            /// </summary>
            /// <param name="head">The document head</param>
            /// <returns>An array of document keywords found from the head section</returns>
            public static string[]? GetDocumentKeywords(string head)
            {
                ReadOnlySpan<char> headChars = head.AsSpan();

                ReadOnlySpan<char> kwStart = headChars.SliceAfterParam("<meta name=\"keywords\" content=\"");
                ReadOnlySpan<char> kwSpan = kwStart.SliceBeforeParam("\">");

                List<string> keywords = [];

                //Split the keywords at comma, and remove any empty entries/whitespace
                kwSpan.Split(',', keywords, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

                return keywords.ToArray();
            }

            public static WebsiteLookupResult ParseHtmlData(string documentHead)
            {
                //Parse head segments for title, description, and keywords
                return new WebsiteLookupResult(
                    title: GetDocTitleFromHead(documentHead),
                    description: GetDocumentSummary(documentHead),
                    keywords: GetDocumentKeywords(documentHead)
                );
            }

            public static async Task<string?> ReadHeadTokenAsync(TextReader reader, CancellationToken cancellation)
            {
                //String buffer to store parsed head data
                StringBuilder stringBuilder = new(1024);

                //Temp copy buffer
                using ArrayPoolBuffer<char> buffer = new(4096);

                bool isStart = true, isEnd = false;

                //scan for document head
                do
                {
                    int read = await reader.ReadBlockAsync(buffer.AsMemory(), cancellation);

                    if (read == 0)
                    {
                        //Read should never return 0, if it does, then there is no head to read
                        return null;
                    }

                    //Only inspect the characters actually read during this iteration
                    Memory<char> chunk = buffer.AsMemory()[..read];

                    if (isStart)
                    {
                        Memory<char> headSpan = HeadStart(chunk);

                        //No head was found, continue buffering
                        if (headSpan.IsEmpty)
                        {
                            continue;
                        }

                        /*
                         * Try to find the end of the head, if it is found, then we can break
                         */
                        isEnd = HeadEnd(ref headSpan);

                        //Valid head data to buffer
                        stringBuilder.Append(headSpan);

                        isStart = false;
                    }
                    else
                    {
                        //Head start was already found, just need to buffer until it ends
                        Memory<char> end = chunk;

                        isEnd = HeadEnd(ref end);

                        stringBuilder.Append(end);

                        if (isEnd)
                        {
                            break;
                        }
                    }

                } while (!isEnd);

                return stringBuilder.ToString();
            }

            static Memory<char> HeadStart(Memory<char> start)
            {
                //find start of head
                int headStartIndex = start.Span.IndexOf("<head>");

                if (headStartIndex == -1)
                {
                    return default;
                }

                return start[headStartIndex..];
            }

            static bool HeadEnd(ref Memory<char> end)
            {
                //find end of head
                int headEndIndex = end.Span.IndexOf("</head>");

                if (headEndIndex == -1)
                {
                    return false;
                }

                end = end[..headEndIndex];
                return true;
            }
        }
    }
}
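
/*
 * Usage sketch (illustrative only, not part of the class above): a rough idea of how a
 * caller might probe for curl availability and then run a metadata lookup. The executable
 * path "curl", the empty additional-argument list, and the example URL are assumptions
 * for demonstration, not values taken from this project.
 *
 *     var curl = new SystemCurlApp(exePath: "curl", httpsOnly: true, additionalArgs: []);
 *
 *     if (await curl.TestIsAvailable(CancellationToken.None))
 *     {
 *         CurlResult lookup = await curl.ExecLookupAsync(
 *             new Uri("https://www.example.com/"),
 *             timeoutMs: 5000,
 *             cancellation: CancellationToken.None
 *         );
 *     }
 */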