Convert PDF to HTML in memory and get List with all images using C# and .NET

Complete code

C#
VB.Net

using System;
using System.IO;
using System.Collections.Generic;
using System.Drawing;
using SkiaSharp;

namespace Sample
{
    class Sample
    {
        static void Main(string[] args)
        {
            // Before starting, we recommend to get a free key:
            // https://sautinsoft.com/start-for-free/
            
            // Apply the key here:
            // SautinSoft.PdfFocus.SetLicense("...");
			
            ConvertPdfBytesToHtml();
        }

        private static void ConvertPdfBytesToHtml()
        {
            // We need files only for demonstration purposes.
            // The whole conversion process will be done in memory.
            string pdfFile = Path.GetFullPath(@"..\..\..\simple text.pdf");
            string htmlFile = "Result.htm";

            // This is the list with extracted images.
            // It will be filled by images after the conversion.
            List<SKBitmap> imgCollection = new List<SKBitmap>();
			
            // Convert PDF to HTML in memory
            SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();


            // Let's force the component to store images inside HTML document
            // using base-64 encoding.
            // Thus the component will not use HDD.
            f.HtmlOptions.IncludeImageInHtml = true;
            f.HtmlOptions.Title = "Simple text";
            

            // Read a PDF document to byte array.
            // Assume that we already have the  PDF as array of bytes.
            byte[] pdf = File.ReadAllBytes(pdfFile);

            f.OpenPdf(pdf);

            if (f.PageCount > 0)
            {
                // Convert PDF to HTML in memory
                string htmlString = f.ToHtml(1, f.PageCount, imgCollection);

                // Save HTML to a file only for the demonstration purpose.
                if (htmlString != null)
                {
                    // Show info about images and save them
                    Console.WriteLine("After converting we've got {0} image(s):", imgCollection.Count);
                    DirectoryInfo imgDir = new DirectoryInfo("Extracted Images");
                    if (!imgDir.Exists)
                        imgDir.Create();

                    int count = 1;
                    foreach (SKBitmap img in imgCollection)
                    {
                        Console.WriteLine("\t {0,4} x {1,4} px", img.Width, img.Height);
                        string imageFileName = Path.Combine(imgDir.FullName, String.Format($"pict{count}.jpg"));
                        FileStream file = new FileStream(imageFileName, FileMode.Create);
                        img.Encode(file, SKEncodedImageFormat.Jpeg, 100);
                        count++;
                    }
                    // Open the result for demonstration purposes.
                    File.WriteAllText(htmlFile, htmlString);
                    System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(htmlFile) { UseShellExecute = true });
                    System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(imgDir.FullName) { UseShellExecute = true });
                }
            }
        }
    }
}

Download

Imports System
Imports System.IO
Imports System.Collections.Generic
Imports System.Drawing
Imports SkiaSharp

Namespace Sample
    Friend Class Sample
        Shared Sub Main(args As String())
            ' Before starting, we recommend to get a free key:
            ' https://sautinsoft.com/start-for-free/

            ' Apply the key here:
            ' SautinSoft.PdfFocus.SetLicense("...")

            ConvertPdfBytesToHtml()
        End Sub

        Private Shared Sub ConvertPdfBytesToHtml()
            ' We need files only for demonstration purposes.
            ' The whole conversion process will be done in memory.
            Dim pdfFile As String = Path.GetFullPath("..\..\..\simple text.pdf")
            Dim htmlFile As String = "Result.htm"

            ' This is the list with extracted images.
            ' It will be filled by images after the conversion.
            Dim imgCollection As New List(Of SKBitmap)()

            ' Convert PDF to HTML in memory
            Dim f As New SautinSoft.PdfFocus()

            ' Let's force the component to store images inside HTML document
            ' using base-64 encoding.
            ' Thus the component will not use HDD.
            f.HtmlOptions.IncludeImageInHtml = True
            f.HtmlOptions.Title = "Simple text"

            ' Read a PDF document to byte array.
            ' Assume that we already have the  PDF as array of bytes.
            Dim pdf As Byte() = File.ReadAllBytes(pdfFile)

            f.OpenPdf(pdf)

            If f.PageCount > 0 Then
                ' Convert PDF to HTML in memory
                Dim htmlString As String = f.ToHtml(1, f.PageCount, imgCollection)

                ' Save HTML to a file only for the demonstration purpose.
                If htmlString IsNot Nothing Then
                    ' Show info about images and save them
                    Console.WriteLine("After converting we've got {0} image(s):", imgCollection.Count)
                    Dim imgDir As New DirectoryInfo("Extracted Images")
                    If Not imgDir.Exists Then
                        imgDir.Create()
                    End If

                    Dim count As Integer = 1
                    For Each img As SKBitmap In imgCollection
                        Console.WriteLine(vbTab & " {0,4} x {1,4} px", img.Width, img.Height)
                        Dim imageFileName As String = Path.Combine(imgDir.FullName, String.Format($"pict{count}.jpg"))
                        Dim file As New FileStream(imageFileName, FileMode.Create)
                        img.Encode(file, SKEncodedImageFormat.Jpeg, 100)
                        count += 1
                    Next

                    ' Open the result for demonstration purposes.
                    File.WriteAllText(htmlFile, htmlString)
                    System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(htmlFile) With {.UseShellExecute = True})
                    System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(imgDir.FullName) With {.UseShellExecute = True})
                End If
            End If
        End Sub
    End Class
End Namespace

Download

If you need a new code example or have a question: email us at support@sautinsoft.com or ask at Online Chat (right-bottom corner of this page) or use the Form below:

Name(optional):

Email:

Message:

Questions and suggestions from you are always welcome!

We are developing .Net components since 2002. We know PDF, DOCX, RTF, HTML, XLSX and Images formats. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.

Convert PDF to HTML in memory and get List with all images using C# and .NET

The captcha is incorrect or the Email field is not filled in, please try again.

Questions and suggestions from you are always welcome!