Convert PDF to HTML in memory and get a list of all images using C# and .NET
Complete code
using System;
using System.IO;
using System.Collections.Generic;
using System.Drawing;
namespace Sample
{
    /// <summary>
    /// Console sample: converts a PDF supplied as a byte array to an HTML string with
    /// SautinSoft.PdfFocus and collects the images found during conversion into a list.
    /// </summary>
    class Sample
    {
        /// <summary>
        /// Program entry point; runs the in-memory PDF to HTML conversion demo.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Before starting, we recommend getting a free 100-day key:
            // https://sautinsoft.com/start-for-free/
            // Apply the key here:
            // SautinSoft.PdfFocus.SetLicense("...");
            ConvertPdfBytesToHtml();
        }

        /// <summary>
        /// Reads a PDF into a byte array, converts all of its pages to a single HTML string,
        /// saves every extracted image as a JPEG into the "Extracted Images" folder, writes the
        /// HTML to "Result.htm", and opens both results with the shell's default handlers.
        /// </summary>
        /// <remarks>
        /// Files are touched only to feed the demo and to show its results; the conversion
        /// call itself works on the in-memory byte array and returns a string.
        /// </remarks>
        private static void ConvertPdfBytesToHtml()
        {
            // We need files only for demonstration purposes.
            string pdfFile = Path.GetFullPath(@"..\..\..\simple text.pdf");
            string htmlFile = "Result.htm";

            // The converter fills this list with the extracted images.
            List<Image> imgCollection = new List<Image>();

            SautinSoft.PdfFocus f = new SautinSoft.PdfFocus();

            // Store images inside the HTML document using base-64 encoding,
            // so that the converter does not have to write image files to disk.
            f.HtmlOptions.IncludeImageInHtml = true;
            f.HtmlOptions.Title = "Simple text";

            // Assume that we already have the PDF as an array of bytes.
            byte[] pdf = File.ReadAllBytes(pdfFile);
            f.OpenPdf(pdf);

            if (f.PageCount <= 0)
            {
                // Report the problem instead of exiting silently.
                Console.Error.WriteLine($"No pages were found in '{pdfFile}'; nothing to convert.");
                return;
            }

            try
            {
                // Convert every page of the PDF to HTML in memory.
                string htmlString = f.ToHtml(1, f.PageCount, imgCollection);
                if (htmlString == null)
                {
                    Console.Error.WriteLine("The conversion did not return any HTML.");
                    return;
                }

                // Show info about the images and save them.
                Console.WriteLine("After converting we've got {0} image(s):", imgCollection.Count);
                DirectoryInfo imgDir = new DirectoryInfo("Extracted Images");
                if (!imgDir.Exists)
                {
                    imgDir.Create();
                }

                int count = 1;
                foreach (Image img in imgCollection)
                {
                    Console.WriteLine("\t {0,4} x {1,4} px", img.Width, img.Height);
                    // The interpolated string is already the final file name; no String.Format needed.
                    string imageFileName = Path.Combine(imgDir.FullName, $"pict{count}.jpg");
                    img.Save(imageFileName, System.Drawing.Imaging.ImageFormat.Jpeg);
                    count++;
                }

                // Save the HTML to a file only for demonstration purposes.
                File.WriteAllText(htmlFile, htmlString);

                // Open the HTML result and the image folder for demonstration purposes.
                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(htmlFile) { UseShellExecute = true });
                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(imgDir.FullName) { UseShellExecute = true });
            }
            finally
            {
                // System.Drawing.Image is IDisposable: release every extracted image,
                // even when saving or writing the results fails part-way through.
                foreach (Image extracted in imgCollection)
                {
                    extracted?.Dispose();
                }
            }
        }
    }
}
Imports Microsoft.VisualBasic
Imports System
Imports System.IO
Imports System.Collections.Generic
Imports System.Drawing
Namespace Sample
    ''' <summary>
    ''' Console sample: converts a PDF supplied as a byte array to an HTML string with
    ''' SautinSoft.PdfFocus and collects the images found during conversion into a list.
    ''' </summary>
    Friend Class Sample
        ''' <summary>
        ''' Program entry point; runs the in-memory PDF to HTML conversion demo.
        ''' </summary>
        ''' <param name="args">Command-line arguments (not used).</param>
        Shared Sub Main(ByVal args() As String)
            ' Before starting, we recommend getting a free 100-day key:
            ' https://sautinsoft.com/start-for-free/
            ' Apply the key here:
            ' SautinSoft.PdfFocus.SetLicense("...")
            ConvertPdfBytesToHtml()
        End Sub

        ''' <summary>
        ''' Reads a PDF into a byte array, converts all of its pages to a single HTML string,
        ''' saves every extracted image as a JPEG into the "Extracted Images" folder, writes the
        ''' HTML to "Result.htm", and opens both results with the shell's default handlers.
        ''' </summary>
        ''' <remarks>
        ''' Files are touched only to feed the demo and to show its results; the conversion
        ''' call itself works on the in-memory byte array and returns a string.
        ''' </remarks>
        Private Shared Sub ConvertPdfBytesToHtml()
            ' We need files only for demonstration purposes.
            Dim pdfFile As String = Path.GetFullPath("..\..\..\simple text.pdf")
            Dim htmlFile As String = "Result.htm"

            ' The converter fills this list with the extracted images.
            Dim imgCollection As New List(Of Image)()

            Dim f As New SautinSoft.PdfFocus()

            ' Store images inside the HTML document using base-64 encoding,
            ' so that the converter does not have to write image files to disk.
            f.HtmlOptions.IncludeImageInHtml = True
            f.HtmlOptions.Title = "Simple text"

            ' Assume that we already have the PDF as an array of bytes.
            Dim pdf() As Byte = File.ReadAllBytes(pdfFile)
            f.OpenPdf(pdf)

            If f.PageCount <= 0 Then
                ' Report the problem instead of exiting silently.
                Console.Error.WriteLine($"No pages were found in '{pdfFile}'; nothing to convert.")
                Return
            End If

            Try
                ' Convert every page of the PDF to HTML in memory.
                Dim htmlString As String = f.ToHtml(1, f.PageCount, imgCollection)
                If htmlString Is Nothing Then
                    Console.Error.WriteLine("The conversion did not return any HTML.")
                    Return
                End If

                ' Show info about the images and save them.
                Console.WriteLine("After converting we've got {0} image(s):", imgCollection.Count)
                Dim imgDir As New DirectoryInfo("Extracted Images")
                If Not imgDir.Exists Then
                    imgDir.Create()
                End If

                Dim count As Integer = 1
                For Each img As Image In imgCollection
                    Console.WriteLine(vbTab & " {0,4} x {1,4} px", img.Width, img.Height)
                    ' The interpolated string is already the final file name; no String.Format needed.
                    Dim imageFileName As String = Path.Combine(imgDir.FullName, $"pict{count}.jpg")
                    img.Save(imageFileName, System.Drawing.Imaging.ImageFormat.Jpeg)
                    count += 1
                Next img

                ' Save the HTML to a file only for demonstration purposes.
                File.WriteAllText(htmlFile, htmlString)

                ' Open the HTML result and the image folder for demonstration purposes.
                System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(htmlFile) With {.UseShellExecute = True})
                System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(imgDir.FullName) With {.UseShellExecute = True})
            Finally
                ' System.Drawing.Image is IDisposable: release every extracted image,
                ' even when saving or writing the results fails part-way through.
                For Each extracted As Image In imgCollection
                    extracted?.Dispose()
                Next extracted
            End Try
        End Sub
    End Class
End Namespace
If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below: