Use OCR to Obtain Text from Scanned PDF in C# and .NET

Optical Character Recognition (OCR) is a powerful technology that allows you to convert various types of documents, such as scanned paper documents, PDF files, or images captured by a digital camera into editable and searchable data.

In this article, we will look at how to perform OCR and extract text from scanned PDF documents using C# and .NET using the SautinSoft.Pdf .NET library.

Step-by-step guide:

  1. Add SautinSoft.PDF from NuGet.
  2. Load a scanned PDF document.
  3. Perform OCR on the PDF document.
  4. Show the extracted text to the Console.

Input file: simple text.pdf

Output result:

Complete code

using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
using SautinSoft.Pdf.Objects;
using System;
using Tesseract;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace OCR
{
    class OCR
    {
        /// <summary>
        /// Perform OCR and extract Text from scanned PDF
        /// </summary>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/perform-ocr-and-extract-text-from-scanned-pdf.php
        /// </remarks>
        static void Main()
        {
            try
            {
                string tesseractLanguages = "eng";
                string tesseractData = Path.GetFullPath(@".\tessdata");
                string tempFile = Path.Combine(tesseractData, Path.GetRandomFileName());
                PdfDocument pdfDocument = PdfDocument.Load(@"..\..\..\Scanned.pdf");

                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    int k = 1;
                    foreach (var pdfPage in pdfDocument.Pages)
                    {
                        Console.WriteLine("<Page " + (k++) + ">");
                        var collection = pdfPage.Content.Elements.All().OfType<PdfImageContent>().ToList();
                        for (int i = 0; i < collection.Count(); i++)
                        {
                            engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;
                            using (MemoryStream ms = new MemoryStream())
                            {
                                collection[i].Save(ms, new ImageSaveOptions());

                                byte[] imgBytes = ms.ToArray();
                                using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                                {
                                    using (var page = engine.Process(img, "Serachablepdf"))
                                    {
                                        var st = page.GetText();
                                        double scale = Math.Min(collection[i].Bounds.Width / page.RegionOfInterest.Width, collection[i].Bounds.Height / page.RegionOfInterest.Height);

                                        using (var iter = page.GetIterator())
                                        {
                                            iter.Begin();

                                            do
                                            {
                                                do
                                                {
                                                    do
                                                    {
                                                        do
                                                        {
                                                            Console.Write(iter.GetText(PageIteratorLevel.Word));
                                                            Console.Write(' ');

                                                            if (iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                                            {
                                                                Console.WriteLine();
                                                            }
                                                        } while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
                                                    } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                                } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                                            } while (iter.Next(PageIteratorLevel.Block));
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                throw new Exception("Error Tesseract: " + e.Message);
            }
            finally
            {

            }
        }
    }
}

Download

Option Infer On

Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Imports SautinSoft.Pdf.Objects
Imports System
Imports Tesseract
Imports System.Collections.Generic
Imports System.IO
Imports System.Linq

Namespace OCR
	Friend Class OCR
		''' <summary>
		''' Perform OCR and extract Text from scanned PDF
		''' </summary>
		''' <remarks>
		''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/perform-ocr-and-extract-text-from-scanned-pdf.php
		''' </remarks>
		Shared Sub Main()
			Try
				Dim tesseractLanguages As String = "eng"
				Dim tesseractData As String = Path.GetFullPath(".\tessdata")
				Dim tempFile As String = Path.Combine(tesseractData, Path.GetRandomFileName())
				Dim pdfDocument As PdfDocument = PdfDocument.Load("..\..\..\Scanned.pdf")

				Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
					Dim k As Integer = 1
					For Each pdfPage In pdfDocument.Pages
' INSTANT VB WARNING: An assignment within expression was extracted from the following statement:
' ORIGINAL LINE: Console.WriteLine("<Page " + (k++) + ">");
						Console.WriteLine("<Page " & (k) & ">")
						k += 1
						Dim collection = pdfPage.Content.Elements.All().OfType(Of PdfImageContent)().ToList()
						For i As Integer = 0 To collection.Count() - 1
							engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto
							Using ms As New MemoryStream()
								collection(i).Save(ms, New ImageSaveOptions())

								Dim imgBytes() As Byte = ms.ToArray()
								Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
									Using page = engine.Process(img, "Serachablepdf")
										Dim st = page.GetText()
										Dim scale As Double = Math.Min(collection(i).Bounds.Width \ page.RegionOfInterest.Width, collection(i).Bounds.Height \ page.RegionOfInterest.Height)

										Using iter = page.GetIterator()
											iter.Begin()

											Do
												Do
													Do
														Do
															Console.Write(iter.GetText(PageIteratorLevel.Word))
															Console.Write(" "c)

															If iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word) Then
																Console.WriteLine()
															End If
														Loop While iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word)
													Loop While iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine)
												Loop While iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para)
											Loop While iter.Next(PageIteratorLevel.Block)
										End Using
									End Using
								End Using
							End Using
						Next i
					Next pdfPage
				End Using
			Catch e As Exception
				Console.WriteLine()
				Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
				Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
				Console.ReadKey()
				Throw New Exception("Error Tesseract: " & e.Message)
			Finally

			End Try
		End Sub
	End Class
End Namespace

Download


If you need a new code example or have a question: email us at support@sautinsoft.com or ask at Online Chat (right-bottom corner of this page) or use the Form below:



Questions and suggestions from you are always welcome!

We are developing .Net components since 2002. We know PDF, DOCX, RTF, HTML, XLSX and Images formats. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.