Execute OCR on PDF with Vector Text in C# and .NET

OCR (Optical Character Recognition) is a technology that allows you to convert various types of documents, such as scanned paper documents, PDF files or images captured by a camera, into editable and searchable data.

In this article, we will look at how to use the SautinSoft.Pdf library for .NET to perform OCR on a PDF document whose text is stored as vector graphics, using C# and .NET.

Step-by-step guide:

  1. Add SautinSoft.PDF from NuGet.
  2. Load a PDF document.
  3. Save the vectorized content as image objects.
  4. Perform OCR.
  5. Save the document in DOCX format.

Input file: simple text.pdf

Output result:

Complete code

using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
using SautinSoft.Pdf.Objects;
using System;
using Tesseract;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace OCR
{
    class OCR
    {
        /// <summary>
        /// OCR a PDF document containing text as vector graphics
        /// </summary>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/ocr-a-pdf-document-containing-text-as-vector-graphics.php
        /// <para>
        /// Every page of the source PDF is rendered to an image, the image is
        /// recognised with Tesseract, and each recognised text line is drawn
        /// at the matching position on a page of a new document, which is
        /// then saved as "text.docx" and opened with the shell.
        /// </para>
        /// </remarks>
        /// <exception cref="Exception">
        /// Any failure is reported on the console and rethrown with the
        /// original exception attached as <see cref="Exception.InnerException"/>.
        /// </exception>
        static void Main()
        {
            try
            {
                string tesseractLanguages = "eng";
                string tesseractData = Path.GetFullPath(@".\tessdata");

                // Keep the source and the result in separate variables instead of
                // reusing one, so it is always clear which document is being touched.
                PdfDocument sourceDocument = PdfDocument.Load(@"..\..\..\Vectorized text.pdf");
                PdfDocument pdfDocument = new PdfDocument();
                PdfFormattedText text = new PdfFormattedText();

                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    // The segmentation mode does not change between pages: set it once.
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                    // Walk over every page of the source, not a hard-coded number of pages.
                    for (int i = 0; i < sourceDocument.Pages.Count; i++)
                    {
                        using (MemoryStream pageImage = new MemoryStream())
                        {
                            // Rasterise page i so that Tesseract can read the vector glyphs.
                            sourceDocument.Save(pageImage, new ImageSaveOptions() { PageIndex = i });

                            var pdfPage = pdfDocument.Pages.Add();
                            byte[] imgBytes = pageImage.ToArray();

                            using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                            using (var page = engine.Process(img, "Serachablepdf"))
                            {
                                // Uniform factor that maps image pixels to page units
                                // while keeping the whole image inside the page.
                                double scale = Math.Min(pdfPage.Size.Width / page.RegionOfInterest.Width, pdfPage.Size.Height / page.RegionOfInterest.Height);

                                using (var iter = page.GetIterator())
                                {
                                    iter.Begin();

                                    // Blocks -> paragraphs -> text lines.
                                    do
                                    {
                                        do
                                        {
                                            do
                                            {
                                                // Skip lines for which no bounding box is available
                                                // instead of drawing them with a default rectangle.
                                                if (iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out Rect liRect))
                                                {
                                                    text.FontSize = liRect.Height * scale;
                                                    //text.Opacity = 0;
                                                    text.Append(iter.GetText(PageIteratorLevel.TextLine));

                                                    // Image Y grows downwards while the PDF point is computed
                                                    // from the page height, hence the flip.
                                                    pdfPage.Content.DrawText(text, new PdfPoint(liRect.X1 * scale, pdfPage.Size.Height - liRect.Y1 * scale - text.Height));

                                                    // The same PdfFormattedText is reused for every line.
                                                    text.Clear();
                                                }
                                            } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                        } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                                    } while (iter.Next(PageIteratorLevel.Block));
                                }
                            }
                        }
                    }
                }

                // The result has to be written before it can be opened.
                // NOTE(review): assumes the output format is selected from the
                // ".docx" extension - confirm against the SautinSoft.Pdf documentation.
                pdfDocument.Save(@"text.docx");
                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(@"text.docx") { UseShellExecute = true });
            }
            catch (Exception e)
            {
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");

                // Keep the original exception as InnerException so its stack trace is not lost.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
        }
    }
}



Option Infer On

Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Imports SautinSoft.Pdf.Objects
Imports System
Imports Tesseract
Imports System.Collections.Generic
Imports System.IO
Imports System.Linq

Namespace OCR
	Friend Class OCR
		''' <summary>
		''' OCR a PDF document containing text as vector graphics
		''' </summary>
		''' <remarks>
		''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/ocr-a-pdf-document-containing-text-as-vector-graphics.php
		''' <para>
		''' Every page of the source PDF is rendered to an image, the image is
		''' recognised with Tesseract, and each recognised text line is drawn
		''' at the matching position on a page of a new document, which is
		''' then saved as "text.docx" and opened with the shell.
		''' </para>
		''' </remarks>
		''' <exception cref="Exception">
		''' Any failure is reported on the console and rethrown with the
		''' original exception attached as InnerException.
		''' </exception>
		Shared Sub Main()
			Try
				Dim tesseractLanguages As String = "eng"
				Dim tesseractData As String = Path.GetFullPath(".\tessdata")

				' VB is case-insensitive, so the locals are deliberately not named
				' "pdfDocument": that would shadow the PdfDocument type.
				Dim sourceDocument As PdfDocument = PdfDocument.Load("..\..\..\Vectorized text.pdf")
				Dim resultDocument As New PdfDocument()
				Dim text As New PdfFormattedText()

				Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
					' The segmentation mode does not change between pages: set it once.
					engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

					For i As Integer = 0 To sourceDocument.Pages.Count - 1
						Using pageImage As New MemoryStream()
							' Rasterise page i so that Tesseract can read the vector glyphs.
							sourceDocument.Save(pageImage, New ImageSaveOptions() With {.PageIndex = i})

							Dim pdfPage = resultDocument.Pages.Add()
							Dim imgBytes As Byte() = pageImage.ToArray()

							Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
								Using ocrPage = engine.Process(img, "Serachablepdf")
									' Uniform factor that maps image pixels to page units
									' while keeping the whole image inside the page.
									Dim scale As Double = Math.Min(pdfPage.Size.Width / ocrPage.RegionOfInterest.Width, pdfPage.Size.Height / ocrPage.RegionOfInterest.Height)

									Using iter = ocrPage.GetIterator()
										iter.Begin()

										' Blocks -> paragraphs -> text lines.
										Do
											Do
												Do
													Dim liRect As Rect
													' Skip lines for which no bounding box is available
													' instead of drawing them with a default rectangle.
													If iter.TryGetBoundingBox(PageIteratorLevel.TextLine, liRect) Then
														text.FontSize = liRect.Height * scale
														'text.Opacity = 0
														text.Append(iter.GetText(PageIteratorLevel.TextLine))

														' Image Y grows downwards while the PDF point is computed
														' from the page height, hence the flip.
														pdfPage.Content.DrawText(text, New PdfPoint(liRect.X1 * scale, pdfPage.Size.Height - liRect.Y1 * scale - text.Height))

														' The same PdfFormattedText is reused for every line.
														text.Clear()
													End If
												Loop While iter.[Next](PageIteratorLevel.Para, PageIteratorLevel.TextLine)
											Loop While iter.[Next](PageIteratorLevel.Block, PageIteratorLevel.Para)
										Loop While iter.[Next](PageIteratorLevel.Block)
									End Using
								End Using
							End Using
						End Using
					Next i
				End Using

				' The result has to be written before it can be opened.
				' NOTE(review): assumes the output format is selected from the
				' ".docx" extension - confirm against the SautinSoft.Pdf documentation.
				resultDocument.Save("text.docx")
				System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo("text.docx") With {.UseShellExecute = True})
			Catch e As Exception
				Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
				Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")

				' Keep the original exception as InnerException so its stack trace is not lost.
				Throw New Exception("Error Tesseract: " & e.Message, e)
			End Try
		End Sub
	End Class
End Namespace


If you need a new code example or have a question: email us at support@sautinsoft.com or ask at Online Chat (right-bottom corner of this page) or use the Form below:

Questions and suggestions from you are always welcome!

We have been developing .NET components since 2002. We know the PDF, DOCX, RTF, HTML, XLSX and image formats. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.