Convert multiple PDF files to HTML in C# and .NET


Complete code

using System;
using System.IO;
using System.Linq;
using System.Text;
using SautinSoft;

namespace Sample
{
    /// <summary>
    /// Console sample that batch-converts PDF files to HTML with the SautinSoft PdfFocus component.
    /// </summary>
    class Sample
    {
        /// <summary>
        /// Entry point. Runs the one-HTML-per-PDF conversion; swap the commented call below
        /// to merge all PDFs into a single HTML document instead.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            ConvertMultiplePdfToHtmls();
            //ConvertMultiplePdfToSingleHtml();
        }

        /// <summary>
        /// Collects the *.pdf files from the directory three levels above the current directory.
        /// </summary>
        /// <returns>Full paths of the PDF files found (top directory only).</returns>
        private static string[] GetSourcePdfFiles()
        {
            // Path.Combine uses the platform's directory separator, so the relative path
            // resolves on Windows and non-Windows systems alike (a literal "..\..\..\" does not).
            string pdfDirectory = Path.GetFullPath(Path.Combine("..", "..", ".."));
            return Directory.GetFiles(pdfDirectory, "*.pdf");
        }

        /// <summary>
        /// Converts multiple PDF files to HTML files.
        /// </summary>
        /// <remarks>
        /// Every PDF is written to "htmls/&lt;name&gt;.html" (relative to the current directory) and its
        /// images to the "&lt;name&gt;_images" subfolder. A summary is printed and the output
        /// folder is opened with the shell when the batch is finished.
        /// </remarks>
        static void ConvertMultiplePdfToHtmls()
        {
            // Directory with *.pdf files.
            string[] pdfFiles = GetSourcePdfFiles();

            // Output directory for the HTML files; created on first run.
            DirectoryInfo htmlDirectory = new DirectoryInfo(@"htmls");
            if (!htmlDirectory.Exists)
            {
                htmlDirectory.Create();
            }

            // Get your free 30-day key here:
            // https://sautinsoft.com/start-for-free/
            PdfFocus f = new PdfFocus();
            // After purchasing the license, please insert your serial number here to activate the component:
            //f.Serial = "XXXXXXXXXXX";

            int success = 0;
            int total = 0;

            foreach (string pdfFile in pdfFiles)
            {
                Console.WriteLine("Converting {0} ...", Path.GetFileName(pdfFile));

                f.OpenPdf(pdfFile);
                total++;

                try
                {
                    // PageCount is 0 when the document could not be loaded; such files are only counted in 'total'.
                    if (f.PageCount > 0)
                    {
                        // Path (must exist) to a directory to store images after converting. See also the property "ImageSubFolder".
                        f.HtmlOptions.ImageFolder = htmlDirectory.FullName;

                        // A folder (will be created by the component) without any drive letters, only the folder as "myfolder".
                        f.HtmlOptions.ImageSubFolder = String.Format("{0}_images", Path.GetFileNameWithoutExtension(pdfFile));

                        // A template name for images.
                        f.HtmlOptions.ImageFileName = "picture";

                        // Auto - the same image format as in the source PDF;
                        // 'Jpeg' to make the document size less;
                        // 'PNG' to keep the highest quality, but the highest size too.
                        f.EmbeddedImagesFormat = PdfFocus.eImageFormat.Auto;

                        // How to store images: inside the HTML document as base64 images or as linked separate image files.
                        f.HtmlOptions.IncludeImageInHtml = false;

                        string htmlFile = Path.GetFileNameWithoutExtension(pdfFile) + ".html";
                        string htmlFilePath = Path.Combine(htmlDirectory.FullName, htmlFile);

                        // ToHtml(path) returns 0 on success.
                        if (f.ToHtml(htmlFilePath) == 0)
                        {
                            success++;
                        }
                    }
                }
                finally
                {
                    // Release the current document before the next one is opened (and after the last one),
                    // even if the conversion throws.
                    f.ClosePdf();
                }
            }

            // Show results:
            Console.WriteLine("{0} of {1} files converted successfully!", success, total);

            // Open the folder with the HTML files for demonstration purposes.
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(htmlDirectory.FullName) { UseShellExecute = true });
        }

        /// <summary>
        /// Converts multiple PDF files into a single HTML document.
        /// </summary>
        /// <remarks>
        /// Each PDF is converted to a body-only HTML fragment with inline CSS; the fragments are
        /// concatenated inside one html/body skeleton and saved as "Result.html" in the current
        /// directory, which is then opened with the shell.
        /// </remarks>
        static void ConvertMultiplePdfToSingleHtml()
        {
            // Directory with *.pdf files.
            string[] pdfFiles = GetSourcePdfFiles();
            string htmlFile = "Result.html";

            // Here we'll keep our HTML document.
            StringBuilder singleHtml = new StringBuilder();
            singleHtml.Append("<html>\r\n<head>\r\n");
            singleHtml.Append(@"<meta http-equiv = ""Content-Type"" content=""text/html; charset=utf-8"" />");
            singleHtml.Append("\r\n</head>\r\n<body>");

            // Get your free 30-day key here:
            // https://sautinsoft.com/start-for-free/
            PdfFocus f = new PdfFocus();
            // After purchasing the license, please insert your serial number here to activate the component:
            //f.Serial = "XXXXXXXXXXX";

            int success = 0;
            int total = 0;

            foreach (string pdfFile in pdfFiles)
            {
                Console.WriteLine("Converting {0} ...", Path.GetFileName(pdfFile));

                f.OpenPdf(pdfFile);
                total++;

                try
                {
                    if (f.PageCount > 0)
                    {
                        // How to store images: inside the HTML document as base64 images or as linked separate image files.
                        // NOTE(review): ImageFolder is not set here (unlike ConvertMultiplePdfToHtmls), so linked images
                        // land in the component's default location - confirm it matches where Result.html is written.
                        f.HtmlOptions.IncludeImageInHtml = false;

                        // Create a separate subfolder for each converted file so that images do not get mixed up.
                        f.HtmlOptions.ImageSubFolder = String.Format("{0}_images", Path.GetFileNameWithoutExtension(pdfFile));

                        // A template name for images.
                        f.HtmlOptions.ImageFileName = "picture";

                        // Auto - the same image format as in the source PDF;
                        // 'Jpeg' to make the document size less;
                        // 'PNG' to keep the highest quality, but the highest size too.
                        f.EmbeddedImagesFormat = PdfFocus.eImageFormat.Auto;

                        // Make the CSS inline to be able to merge HTML documents without style clashes.
                        f.HtmlOptions.InlineCSS = true;

                        // We need only the contents of <body>...</body>.
                        f.HtmlOptions.ProduceOnlyHtmlBody = true;

                        string tempHtml = f.ToHtml();

                        // An empty result means the conversion failed; skip it.
                        if (!String.IsNullOrEmpty(tempHtml))
                        {
                            success++;
                            // Add tempHtml into the single HTML.
                            singleHtml.Append(tempHtml);
                        }
                    }
                }
                finally
                {
                    // Release the current document before the next one is opened (and after the last one),
                    // even if the conversion throws.
                    f.ClosePdf();
                }
            }
            singleHtml.Append("</body></html>");

            // Save the merged document.
            File.WriteAllText(htmlFile, singleHtml.ToString());

            // Show results:
            Console.WriteLine("{0} of {1} files converted and merged into {2}!", success, total, Path.GetFileName(htmlFile));

            // Open the result for demonstration purposes.
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(htmlFile) { UseShellExecute = true });
        }
    }
}

Download

Imports Microsoft.VisualBasic
Imports System
Imports System.IO
Imports System.Linq
Imports System.Text
Imports SautinSoft

Namespace Sample
    ''' <summary>
    ''' Console sample that batch-converts PDF files to HTML with the SautinSoft PdfFocus component.
    ''' </summary>
    Friend Class Sample
        ''' <summary>
        ''' Entry point. Runs the merge-into-one-HTML conversion; swap the commented call below
        ''' to produce one HTML file per PDF instead.
        ''' </summary>
        ''' <param name="args">Command-line arguments (not used).</param>
        Shared Sub Main(ByVal args() As String)
            'ConvertMultiplePdfToHtmls()
            ConvertMultiplePdfToSingleHtml()
        End Sub

        ''' <summary>
        ''' Collects the *.pdf files from the directory three levels above the current directory.
        ''' </summary>
        ''' <returns>Full paths of the PDF files found (top directory only).</returns>
        Private Shared Function GetSourcePdfFiles() As String()
            ' Both conversions must read the same folder, so the location is defined once here.
            ' Path.Combine uses the platform's directory separator, so the relative path
            ' resolves on Windows and non-Windows systems alike (a literal "..\..\..\" does not).
            Dim pdfDirectory As String = Path.GetFullPath(Path.Combine("..", "..", ".."))
            Return Directory.GetFiles(pdfDirectory, "*.pdf")
        End Function

        ''' <summary>
        ''' Converts multiple PDF files to HTML files.
        ''' </summary>
        ''' <remarks>
        ''' Every PDF is written to "htmls/&lt;name&gt;.html" (relative to the current directory) and its
        ''' images to the "&lt;name&gt;_images" subfolder. A summary is printed and the output
        ''' folder is opened with the shell when the batch is finished.
        ''' </remarks>
        Private Shared Sub ConvertMultiplePdfToHtmls()
            ' Directory with *.pdf files.
            Dim pdfFiles() As String = GetSourcePdfFiles()

            ' Output directory for the HTML files; created on first run.
            Dim htmlDirectory As New DirectoryInfo("htmls")
            If Not htmlDirectory.Exists Then
                htmlDirectory.Create()
            End If

            ' Get your free 30-day key here:
            ' https://sautinsoft.com/start-for-free/
            Dim f As New PdfFocus()
            ' After purchasing the license, please insert your serial number here to activate the component:
            'f.Serial = "XXXXXXXXXXX"

            Dim success As Integer = 0
            Dim total As Integer = 0

            For Each pdfFile As String In pdfFiles
                Console.WriteLine("Converting {0} ...", Path.GetFileName(pdfFile))

                f.OpenPdf(pdfFile)
                total += 1

                Try
                    ' PageCount is 0 when the document could not be loaded; such files are only counted in 'total'.
                    If f.PageCount > 0 Then
                        ' Path (must exist) to a directory to store images after converting. See also the property "ImageSubFolder".
                        f.HtmlOptions.ImageFolder = htmlDirectory.FullName

                        ' A folder (will be created by the component) without any drive letters, only the folder as "myfolder".
                        f.HtmlOptions.ImageSubFolder = String.Format("{0}_images", Path.GetFileNameWithoutExtension(pdfFile))

                        ' A template name for images.
                        f.HtmlOptions.ImageFileName = "picture"

                        ' Auto - the same image format as in the source PDF;
                        ' 'Jpeg' to make the document size less;
                        ' 'PNG' to keep the highest quality, but the highest size too.
                        f.EmbeddedImagesFormat = PdfFocus.eImageFormat.Auto

                        ' How to store images: inside the HTML document as base64 images or as linked separate image files.
                        f.HtmlOptions.IncludeImageInHtml = False

                        Dim htmlFile As String = Path.GetFileNameWithoutExtension(pdfFile) & ".html"
                        Dim htmlFilePath As String = Path.Combine(htmlDirectory.FullName, htmlFile)

                        ' ToHtml(path) returns 0 on success.
                        If f.ToHtml(htmlFilePath) = 0 Then
                            success += 1
                        End If
                    End If
                Finally
                    ' Release the current document before the next one is opened (and after the last one),
                    ' even if the conversion throws.
                    f.ClosePdf()
                End Try
            Next pdfFile

            ' Show results:
            Console.WriteLine("{0} of {1} files converted successfully!", success, total)

            ' Open the folder with the HTML files for demonstration purposes.
            System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(htmlDirectory.FullName) With {.UseShellExecute = True})
        End Sub

        ''' <summary>
        ''' Converts multiple PDF files into a single HTML document.
        ''' </summary>
        ''' <remarks>
        ''' Each PDF is converted to a body-only HTML fragment with inline CSS; the fragments are
        ''' concatenated inside one html/body skeleton and saved as "Result.html" in the current
        ''' directory, which is then opened with the shell.
        ''' </remarks>
        Private Shared Sub ConvertMultiplePdfToSingleHtml()
            ' Directory with *.pdf files (same location as ConvertMultiplePdfToHtmls uses).
            Dim pdfFiles() As String = GetSourcePdfFiles()
            Dim htmlFile As String = "Result.html"

            ' Here we'll keep our HTML document.
            Dim singleHtml As New StringBuilder()
            singleHtml.Append("<html>" & vbCrLf & "<head>" & vbCrLf)
            singleHtml.Append("<meta http-equiv = ""Content-Type"" content=""text/html; charset=utf-8"" />")
            singleHtml.Append(vbCrLf & "</head>" & vbCrLf & "<body>")

            ' Get your free 30-day key here:
            ' https://sautinsoft.com/start-for-free/
            Dim f As New PdfFocus()
            ' After purchasing the license, please insert your serial number here to activate the component:
            'f.Serial = "XXXXXXXXXXX"

            Dim success As Integer = 0
            Dim total As Integer = 0

            For Each pdfFile As String In pdfFiles
                Console.WriteLine("Converting {0} ...", Path.GetFileName(pdfFile))

                f.OpenPdf(pdfFile)
                total += 1

                Try
                    If f.PageCount > 0 Then
                        ' How to store images: inside the HTML document as base64 images or as linked separate image files.
                        ' NOTE(review): ImageFolder is not set here (unlike ConvertMultiplePdfToHtmls), so linked images
                        ' land in the component's default location - confirm it matches where Result.html is written.
                        f.HtmlOptions.IncludeImageInHtml = False

                        ' Create a separate subfolder for each converted file so that images do not get mixed up.
                        f.HtmlOptions.ImageSubFolder = String.Format("{0}_images", Path.GetFileNameWithoutExtension(pdfFile))

                        ' A template name for images.
                        f.HtmlOptions.ImageFileName = "picture"

                        ' Auto - the same image format as in the source PDF;
                        ' 'Jpeg' to make the document size less;
                        ' 'PNG' to keep the highest quality, but the highest size too.
                        f.EmbeddedImagesFormat = PdfFocus.eImageFormat.Auto

                        ' Make the CSS inline to be able to merge HTML documents without style clashes.
                        f.HtmlOptions.InlineCSS = True

                        ' We need only the contents of <body>...</body>.
                        f.HtmlOptions.ProduceOnlyHtmlBody = True

                        Dim tempHtml As String = f.ToHtml()

                        ' An empty result means the conversion failed; skip it.
                        If Not String.IsNullOrEmpty(tempHtml) Then
                            success += 1
                            ' Add tempHtml into the single HTML.
                            singleHtml.Append(tempHtml)
                        End If
                    End If
                Finally
                    ' Release the current document before the next one is opened (and after the last one),
                    ' even if the conversion throws.
                    f.ClosePdf()
                End Try
            Next pdfFile
            singleHtml.Append("</body></html>")

            ' Save the merged document.
            File.WriteAllText(htmlFile, singleHtml.ToString())

            ' Show results:
            Console.WriteLine("{0} of {1} files converted and merged into {2}!", success, total, Path.GetFileName(htmlFile))

            ' Open the result for demonstration purposes.
            System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(htmlFile) With {.UseShellExecute = True})
        End Sub
    End Class
End Namespace

Download


If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below:



Questions and suggestions from you are always welcome!

We have been developing .NET components since 2002. We know the PDF, DOCX, RTF, HTML, XLSX and image formats well. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.