Scrape rendered HTML with .NET 6 and C#

using HtmlAgilityPack;
using OpenQA.Selenium;
using OpenQA.Selenium.Chrome;
using OpenQA.Selenium.PhantomJS;

namespace Christopher.Snay.Sample.Services.Scrapers
{
	/// <summary>
	/// Scrapes a page by opening it in a Selenium-driven Chrome browser and parsing the
	/// browser's page source into an <see cref="HtmlDocument"/>.
	/// </summary>
	internal class ChromeScraper : IChromeScraper
	{
		/// <summary>
		/// Fixed pause between navigation and reading the page source, giving client-side
		/// scripts time to render.
		/// </summary>
		private static readonly TimeSpan RenderDelay = TimeSpan.FromMilliseconds(3000);

		/// <summary>
		/// Scrapes the page at <paramref name="url"/>.
		/// </summary>
		/// <param name="url">Address of the page to load.</param>
		/// <returns>The page source parsed into an <see cref="HtmlDocument"/>.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
		public HtmlDocument ScrapeHtml(Uri url)
		{
			ArgumentNullException.ThrowIfNull(url);

			// AbsoluteUri keeps percent-escapes intact, whereas Uri.ToString() returns an
			// unescaped form that may no longer identify the same resource. AbsoluteUri
			// throws for relative URIs, so those keep the previous ToString() behavior.
			string address = url.IsAbsoluteUri ? url.AbsoluteUri : url.ToString();

			return ScrapeHtml(address);
		}

		/// <summary>
		/// Scrapes the page at <paramref name="url"/>.
		/// </summary>
		/// <param name="url">Address of the page to load.</param>
		/// <returns>The page source parsed into an <see cref="HtmlDocument"/>.</returns>
		/// <exception cref="ArgumentNullException"><paramref name="url"/> is null.</exception>
		public HtmlDocument ScrapeHtml(string url)
		{
			// Reject null before any driver or browser process is launched.
			ArgumentNullException.ThrowIfNull(url);

			HtmlDocument doc = new();

			// NOTE(review): this PhantomJS service is started, but the ChromeDriver below is
			// created without any reference to it, so these settings presumably do not affect
			// the Chrome session. Confirm whether the service is actually needed.
			using PhantomJSDriverService driverService = PhantomJSDriverService.CreateDefaultService();
			driverService.HideCommandPromptWindow = true;
			driverService.LoadImages = false;
			driverService.IgnoreSslErrors = true;
			driverService.Start();

			// Declared after the service, so the browser is disposed first.
			using IWebDriver driver = new ChromeDriver();

			driver.Navigate().GoToUrl(url);

			Thread.Sleep(RenderDelay);

			doc.LoadHtml(driver.PageSource);

			return doc;
		}
	}

	/// <summary>
	/// Contract for scrapers that return the HTML of a page as an <see cref="HtmlDocument"/>.
	/// </summary>
	public interface IChromeScraper
	{
		/// <summary>
		/// Scrapes the page at the given address.
		/// </summary>
		/// <param name="url">Address of the page, as a string.</param>
		/// <returns>The scraped page as an <see cref="HtmlDocument"/>.</returns>
		HtmlDocument ScrapeHtml(string url);

		/// <summary>
		/// Scrapes the page at the given address.
		/// </summary>
		/// <param name="url">Address of the page, as a <see cref="Uri"/>.</param>
		/// <returns>The scraped page as an <see cref="HtmlDocument"/>.</returns>
		HtmlDocument ScrapeHtml(Uri url);
	}
}

Requirements – The following executables must be in /bin

  • phantomjs.exe
  • chrome.exe
  • chromedriver.exe
  • + the chrome portable binaries directory, currently named \107.0.5304.88
<!--
	Project-file item group: copies the driver/browser executables from Assets\ directly
	into the build output directory (bin) without placing them in a sub-folder.
	TargetPath is the file's path relative to the output directory; PreserveNewest
	copies the file only when the source is newer than the existing copy.
-->
<ItemGroup>
	<ContentWithTargetPath Include="Assets\phantomjs.exe">
		<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
		<TargetPath>phantomjs.exe</TargetPath>
	</ContentWithTargetPath>
	<ContentWithTargetPath Include="Assets\chrome.exe">
		<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
		<TargetPath>chrome.exe</TargetPath>
	</ContentWithTargetPath>
	<ContentWithTargetPath Include="Assets\chromedriver.exe">
		<CopyToOutputDirectory>PreserveNewest</CopyToOutputDirectory>
		<TargetPath>chromedriver.exe</TargetPath>
	</ContentWithTargetPath>

	<!-- NOTE(review): the same files are also listed as None items, presumably so they stay visible in the IDE's project tree; confirm. -->
	<None Include="Assets\phantomjs.exe" />
	<None Include="Assets\chrome.exe" />
	<None Include="Assets\chromedriver.exe" />
</ItemGroup>

I’ve used Chrome portable to avoid having to install Chrome. If Chrome is installed, the chrome.exe steps can probably be skipped.