// Copyright (C) 2024 Vaughn Nugent
//
// This program is free software: you can redistribute it and/or modify
// it under the terms of the GNU Affero General Public License as
// published by the Free Software Foundation, either version 3 of the
// License, or (at your option) any later version.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
// GNU Affero General Public License for more details.
//
// You should have received a copy of the GNU Affero General Public License
// along with this program. If not, see <https://www.gnu.org/licenses/>.
using System;
using System.IO;
using System.Text;
using System.Threading;
using System.Diagnostics;
using System.ComponentModel;
using System.Threading.Tasks;
using System.Collections.Generic;
using VNLib.Utils.Memory;
using VNLib.Utils.Extensions;
namespace SimpleBookmark.PlatformFeatures.Curl
{
// Thin wrapper around the system `curl` executable used for website lookups.
// Primary-constructor state: the curl executable path, an https-only policy
// flag, and extra global arguments appended to every invocation.
sealed class SystemCurlApp(string exePath, bool httpsOnly, string[] additionalArgs) : ISystemApp, ICurlApp
{
//Fallback request timeout (milliseconds) when the caller does not supply one
const int DefaultTimeoutMs = 5000;
/// <inheritdoc/>
public async Task<bool> TestIsAvailable(CancellationToken cancellation)
{
    try
    {
        //Test if the curl application is available on the local system, may be at path
        using Process? process = Exec(["--version"]);
        if (process is null)
        {
            return false;
        }

        //Wait for the process to exit
        await process.WaitForExitAsync(cancellation);

        //If an ok status code, then we know the curl application is available
        return process.ExitCode == 0;
    }
    //App not found (the OS could not start the executable)
    catch (Win32Exception)
    {
        return false;
    }
}
/// <summary>
/// Launches the configured curl executable as a headless child process
/// with both output streams captured as UTF-8 text.
/// </summary>
/// <param name="arguments">The argument vector to pass to curl, verbatim</param>
/// <returns>The started process, or null if no process was started</returns>
private Process? Exec(string[] arguments)
{
    ProcessStartInfo psi = new()
    {
        FileName = exePath,
        RedirectStandardOutput = true,
        RedirectStandardError = true,
        UseShellExecute = false,
        CreateNoWindow = true,
        StandardOutputEncoding = Encoding.UTF8,
        StandardErrorEncoding = Encoding.UTF8
    };

    //Forward each caller argument unmodified (ArgumentList handles quoting)
    foreach (string arg in arguments)
    {
        psi.ArgumentList.Add(arg);
    }

    return Process.Start(psi);
}
/// <summary>
/// Validates that the url is non-null, absolute, and uses an allowed
/// scheme (https only when the https-only policy is enabled, otherwise
/// http or https).
/// </summary>
/// <param name="website">The url to validate</param>
private void ValidateUrl(Uri? website)
{
    ArgumentNullException.ThrowIfNull(website);

    if (!website.IsAbsoluteUri)
    {
        throw new ArgumentException("The website url must be an absolute uri", nameof(website));
    }

    bool isHttps = website.Scheme == Uri.UriSchemeHttps;
    bool isHttp = website.Scheme == Uri.UriSchemeHttp;

    //Enforce the https-only policy first
    if (httpsOnly && !isHttps)
    {
        throw new ArgumentException("The website url must be an https url only!", nameof(website));
    }

    //Http or https only
    if (!isHttp && !isHttps)
    {
        throw new ArgumentException("The website url must be an http or https url", nameof(website));
    }
}
/// <inheritdoc/>
public async Task<CurlResult> ExecLookupAsync(Uri website, int? timeoutMs, CancellationToken cancellation)
{
    //Validate the url
    ValidateUrl(website);

    //curl takes --max-time in whole seconds, callers supply milliseconds
    string timeoutArg = timeoutMs.HasValue ? $"{timeoutMs.Value / 1000}" : $"{DefaultTimeoutMs / 1000}";

    string[] args = [
        "--max-time", timeoutArg,                       //Set the max time for the request
        "-S",                                           //Silent mode is required
        "-H", "Accept: text/html,application/html",     //Html is required
        ..additionalArgs,                               //Additional global arguments
        website.AbsoluteUri
    ];

    //Execute the curl command
    using Process? process = Exec(args);
    if (process is null)
    {
        return new CurlResult(null, true, "Curl is not enabled on this platform, lookup failed");
    }

    //Parse the html head as it streams from curl's stdout
    Task<string?> documentHeadTask = HtmlTokenReader.ReadHeadTokenAsync(process.StandardOutput, cancellation);

    //Respect the user's timeout command and terminate the process if it exceeds the timeout
    if (timeoutMs.HasValue)
    {
        await documentHeadTask.WaitAsync(TimeSpan.FromMilliseconds(timeoutMs.Value));

        //Drain both streams so the child can exit, bounded by the same timeout
        await Task.WhenAll(
            DiscardStreamAsync(process.StandardOutput, cancellation),
            DiscardStreamAsync(process.StandardError, cancellation)
        ).WaitAsync(TimeSpan.FromMilliseconds(timeoutMs.Value));
    }
    else
    {
        await documentHeadTask;

        await Task.WhenAll(
            DiscardStreamAsync(process.StandardOutput, cancellation),
            DiscardStreamAsync(process.StandardError, cancellation)
        );
    }

    await process.WaitForExitAsync(cancellation);

    if (process.ExitCode != 0)
    {
        return new CurlResult(null, true, "Curl exited with a non-zero status code");
    }

    //Head task already completed above; re-await only to fetch its result
    string? documentHead = await documentHeadTask;
    if (documentHead is null)
    {
        return new CurlResult(null, true, "Failed to parse html data");
    }

    //Get the lookup result from the document head segment
    WebsiteLookupResult result = HtmlTokenReader.ParseHtmlData(documentHead);

    return new CurlResult(result, false, null);
}
/// <summary>
/// Safely discards the entire stream of data from the reader without
/// allocating a large string buffer
/// </summary>
/// <param name="reader">The reader to discard</param>
/// <param name="cancellation">A token to cancel the operation</param>
/// <returns>A task that represents the discard operation</returns>
private static async Task DiscardStreamAsync(TextReader reader, CancellationToken cancellation)
{
    //Rented buffer avoids materializing the remaining stream contents
    using ArrayPoolBuffer<char> discardBuffer = new(8192);

    //Drain until the reader reports end-of-stream (0 chars read)
    while (await reader.ReadBlockAsync(discardBuffer.AsMemory(), cancellation) > 0)
    { }
}
/// <summary>
/// Minimal streaming tokenizer that extracts the &lt;head&gt; section of an
/// html document and pulls the title, meta description, and meta keywords
/// out of it without a full html parser.
/// </summary>
private static class HtmlTokenReader
{
    /// <summary>
    /// Gets the document title from the head of the html document
    /// </summary>
    /// <param name="head">The head string containing the title to parse</param>
    /// <returns>The title string if found</returns>
    public static string? GetDocTitleFromHead(string head)
    {
        ReadOnlySpan<char> headChars = head.AsSpan();

        //Isolate the text between the title open/close tags
        ReadOnlySpan<char> title = headChars.SliceAfterParam("<title>");
        title = title.SliceBeforeParam("</title>");

        return title.ToString();
    }

    /// <summary>
    /// Attempts to get the document summary from the head of the html document
    /// in the meta description tag
    /// </summary>
    /// <param name="head">The head string to parse</param>
    /// <returns>The document description if found</returns>
    public static string? GetDocumentSummary(string head)
    {
        ReadOnlySpan<char> headChars = head.AsSpan();

        //NOTE(review): meta-tag token reconstructed — assumes name-before-content
        //attribute order with double quotes; confirm against real curl output
        ReadOnlySpan<char> desc = headChars.SliceAfterParam("<meta name=\"description\" content=\"");
        desc = desc.SliceBeforeParam("\">");

        return desc.ToString();
    }

    /// <summary>
    /// Attempts to get the document keywords from the head of the html document
    /// by parsing the meta keywords tag
    /// </summary>
    /// <param name="head">The document head</param>
    /// <returns>An array of document keywords found from the head section</returns>
    public static string[]? GetDocumentKeywords(string head)
    {
        ReadOnlySpan<char> headChars = head.AsSpan();

        //NOTE(review): meta-tag token reconstructed — same attribute-order assumption as above
        ReadOnlySpan<char> kwStart = headChars.SliceAfterParam("<meta name=\"keywords\" content=\"");
        ReadOnlySpan<char> kwSpan = kwStart.SliceBeforeParam("\">");

        List<string> keywords = [];

        //Split the keywords at comma, and remove any empty entries/whitespace
        kwSpan.Split(',', keywords, StringSplitOptions.RemoveEmptyEntries | StringSplitOptions.TrimEntries);

        return keywords.ToArray();
    }

    /// <summary>
    /// Parses the title, description, and keywords from the captured
    /// document head segment
    /// </summary>
    /// <param name="documentHead">The buffered head segment to parse</param>
    /// <returns>The lookup result containing any segments that were found</returns>
    public static WebsiteLookupResult ParseHtmlData(string documentHead)
    {
        //Parse head segments for title, description, and keywords
        return new WebsiteLookupResult(
            title: GetDocTitleFromHead(documentHead),
            description: GetDocumentSummary(documentHead),
            keywords: GetDocumentKeywords(documentHead)
        );
    }

    /// <summary>
    /// Buffers the document head section from the reader, starting at the
    /// head open tag and stopping at the head close tag
    /// </summary>
    /// <param name="reader">The reader streaming raw html data</param>
    /// <param name="cancellation">A token to cancel the operation</param>
    /// <returns>The buffered head segment, or null if no data could be read</returns>
    public static async Task<string?> ReadHeadTokenAsync(TextReader reader, CancellationToken cancellation)
    {
        //String buffer to store parsed head data
        StringBuilder stringBuilder = new(1024);

        //Temp copy buffer
        using ArrayPoolBuffer<char> buffer = new(4096);

        bool isStart = true, isEnd = false;

        //scan for document head
        do
        {
            int read = await reader.ReadBlockAsync(buffer.AsMemory(), cancellation);
            if (read == 0)
            {
                //Read should never return 0, if it does, then there is no head to read
                return null;
            }

            //Only scan the chars actually read this pass, never stale buffer tail
            Memory<char> window = buffer.AsMemory()[..read];

            if (isStart)
            {
                Memory<char> headSpan = HeadStart(window);

                //No head was found, continue buffering
                if (headSpan.IsEmpty)
                {
                    continue;
                }

                /*
                 * Try to find the end of the head, if it is found, then we can break
                 */
                isEnd = HeadEnd(ref headSpan);

                //Valid head data to buffer
                stringBuilder.Append(headSpan);
                isStart = false;
            }
            else
            {
                //Head start was already found, just need to buffer until it ends
                Memory<char> end = window;
                isEnd = HeadEnd(ref end);

                stringBuilder.Append(end);

                if (isEnd)
                {
                    break;
                }
            }
        } while (!isEnd);

        return stringBuilder.ToString();
    }

    //Returns the slice beginning at the head open tag, or empty if not found
    static Memory<char> HeadStart(Memory<char> start)
    {
        //find start of head
        int headStartIndex = start.Span.IndexOf("<head>");
        if (headStartIndex == -1)
        {
            return default;
        }

        return start[headStartIndex..];
    }

    //Trims the slice at the head close tag; true if the tag was found
    static bool HeadEnd(ref Memory<char> end)
    {
        //find end of head
        int headEndIndex = end.Span.IndexOf("</head>");
        if (headEndIndex == -1)
        {
            return false;
        }

        end = end[..headEndIndex];
        return true;
    }
}
}
}