Create Searchable PDF from Scanned Image in C# and .NET

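A searchable PDF contains real text instead of only a picture of the text, so its content can be searched, selected and copied. To build one from a scanned image, the text is first recognized with OCR (Optical Character Recognition) and then written onto a PDF page at the position where each line was found in the image.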

In this article, we will look at how to create a searchable PDF from a scanned image using the SautinSoft.Pdf library and Tesseract OCR in C# and .NET.

  1. Add SautinSoft.PDF from NuGet.
  2. Load an image.
  3. Apply OCR for text recognition.
  4. Save the document in PDF format.

Input file: simple text.png

Output result:

Complete code

using Net.Pkcs11Interop.HighLevelAPI.MechanismParams;
using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
using SautinSoft.Pdf.Objects;
using System;
using Tesseract;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace OCR
{
    class OCR
    {
        /// <summary>
        /// Convert scanned Image to searchable PDF
        /// </summary>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/convert-scanned-image-to-searchable-pdf.php
        /// <para>
        /// Reads the image "..\..\..\Potato Beetle.png", runs Tesseract OCR on it using the
        /// language data found in ".\tessdata", draws every recognized text line onto a new
        /// PDF page, saves the document as "Result.pdf" and opens it with the shell.
        /// </para>
        /// </remarks>
        /// <exception cref="Exception">
        /// Any failure is reported on the console and rethrown with the prefix
        /// "Error Tesseract: "; the original exception is available as InnerException.
        /// </exception>
        static void Main()
        {
            try
            {
                string tesseractLanguages = "eng";
                string tesseractData = Path.GetFullPath(@".\tessdata");
                PdfDocument pdfDocument = new PdfDocument();
                var pdfPage = pdfDocument.Pages.Add();
                PdfFormattedText text = new PdfFormattedText();

                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                    // File.ReadAllBytes opens the image read-only and closes the handle itself,
                    // so no stream is left undisposed.
                    byte[] imgBytes = File.ReadAllBytes(@"..\..\..\Potato Beetle.png");

                    using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                    using (var page = engine.Process(img, "Serachablepdf"))
                    {
                        // Uniform factor that maps image coordinates onto the PDF page.
                        double scale = Math.Min(pdfPage.Size.Width / page.RegionOfInterest.Width, pdfPage.Size.Height / page.RegionOfInterest.Height);

                        using (var iter = page.GetIterator())
                        {
                            iter.Begin();

                            // Walk the layout: blocks -> paragraphs -> text lines.
                            do
                            {
                                do
                                {
                                    do
                                    {
                                        // Skip lines whose bounding box cannot be determined;
                                        // there is no position or size to draw them with.
                                        if (!iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out Rect liRect))
                                        {
                                            continue;
                                        }

                                        string lineText = iter.GetText(PageIteratorLevel.TextLine);
                                        if (string.IsNullOrWhiteSpace(lineText))
                                        {
                                            continue;
                                        }

                                        // The font size follows the height of the recognized line.
                                        text.FontSize = liRect.Height * scale;
                                        text.Append(lineText);

                                        // Image Y grows downwards, while the Y passed to DrawText is
                                        // measured from the opposite edge, hence the subtraction.
                                        pdfPage.Content.DrawText(text, new PdfPoint(liRect.X1 * scale, pdfPage.Size.Height - liRect.Y1 * scale - text.Height));
                                        text.Clear();
                                    } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                            } while (iter.Next(PageIteratorLevel.Block));
                        }
                    }
                }

                pdfDocument.Save(@"Result.pdf");
                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo("Result.pdf") { UseShellExecute = true });
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();

                // Keep the original exception as InnerException so its type and stack trace are not lost.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
        }
    }
}

Download

Option Infer On

Imports Net.Pkcs11Interop.HighLevelAPI.MechanismParams
Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Imports SautinSoft.Pdf.Objects
Imports System
Imports Tesseract
Imports System.Collections.Generic
Imports System.IO
Imports System.Linq

Namespace OCR
	Friend Class OCR
		''' <summary>
		''' Convert scanned Image to searchable PDF
		''' </summary>
		''' <remarks>
		''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/convert-scanned-image-to-searchable-pdf.php
		''' <para>
		''' Reads the image "..\..\..\Potato Beetle.png", runs Tesseract OCR on it using the
		''' language data found in ".\tessdata", draws every recognized text line onto a new
		''' PDF page, saves the document as "Result.pdf" and opens it with the shell.
		''' </para>
		''' </remarks>
		''' <exception cref="Exception">
		''' Any failure is reported on the console and rethrown with the prefix
		''' "Error Tesseract: "; the original exception is available as InnerException.
		''' </exception>
		Shared Sub Main()
			Try
				Dim tesseractLanguages As String = "eng"
				Dim tesseractData As String = Path.GetFullPath(".\tessdata")
				Dim pdfDocument As New PdfDocument()
				Dim pdfPage = pdfDocument.Pages.Add()
				Dim text As New PdfFormattedText()

				Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)

					engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

					' File.ReadAllBytes opens the image read-only and closes the handle itself,
					' so no stream is left undisposed.
					Dim imgBytes() As Byte = File.ReadAllBytes("..\..\..\Potato Beetle.png")

					Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
						Using page = engine.Process(img, "Serachablepdf")
							' Uniform factor that maps image coordinates onto the PDF page.
							Dim scale As Double = Math.Min(pdfPage.Size.Width / page.RegionOfInterest.Width, pdfPage.Size.Height / page.RegionOfInterest.Height)

							Using iter = page.GetIterator()
								iter.Begin()

								' Walk the layout: blocks -> paragraphs -> text lines.
								Do
									Do
										Do
											Dim liRect As Rect
											' Skip lines whose bounding box cannot be determined;
											' there is no position or size to draw them with.
											If iter.TryGetBoundingBox(PageIteratorLevel.TextLine, liRect) Then
												Dim lineText As String = iter.GetText(PageIteratorLevel.TextLine)
												If Not String.IsNullOrWhiteSpace(lineText) Then
													' The font size follows the height of the recognized line.
													text.FontSize = liRect.Height * scale
													text.Append(lineText)
													' Image Y grows downwards, while the Y passed to DrawText is
													' measured from the opposite edge, hence the subtraction.
													pdfPage.Content.DrawText(text, New PdfPoint(liRect.X1 * scale, pdfPage.Size.Height - liRect.Y1 * scale - text.Height))
													text.Clear()
												End If
											End If
										Loop While iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine)
									Loop While iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para)
								Loop While iter.Next(PageIteratorLevel.Block)
							End Using
						End Using
					End Using
				End Using
				pdfDocument.Save("Result.pdf")
				System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo("Result.pdf") With {.UseShellExecute = True})
			Catch e As Exception
				Console.WriteLine()
				Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
				Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
				Console.ReadKey()
				' Keep the original exception as InnerException so its type and stack trace are not lost.
				Throw New Exception("Error Tesseract: " & e.Message, e)
			End Try
		End Sub
	End Class
End Namespace

Download


If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below:



Questions and suggestions from you are always welcome!

We have been developing .NET components since 2002. We know the PDF, DOCX, RTF, HTML, XLSX and image formats. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.