Apply OCR to Vector Graphic PDF in C# and .NET

OCR (Optical Character Recognition) is a technology that allows you to convert various types of documents, such as scanned paper documents, PDF files or images captured by a camera, into editable and searchable data.

In this article, we will look at how to use the SautinSoft.Pdf .NET library to perform OCR on a PDF document whose text is stored as vector graphics, using C# and .NET.

Step-by-step guide:

  1. Add SautinSoft.PDF from NuGet.
  2. Load a PDF document.
  3. Save the vectorized content as image objects.
  4. Perform OCR.
  5. Save the document in DOCX format.

Input file: Vectorized text.pdf

OCR a PDF document containing text as vector graphics input

Output result:

OCR a PDF document containing text as vector graphics output

Complete code

using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
using SautinSoft.Pdf.Objects;
using System;
using Tesseract;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace OCR
{
    class OCR
    {
        /// <summary>
        /// OCR a PDF document containing text as vector graphics.
        /// </summary>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/ocr-a-pdf-document-containing-text-as-vector-graphics.php
        /// <para>
        /// Workflow: the source PDF is saved as an image, the image is recognized with Tesseract
        /// line by line, and every recognized line is drawn as real text onto a page of a new
        /// document, which is then saved as "text.docx" and opened with the shell.
        /// </para>
        /// <para>
        /// Requires the Tesseract language data files (*.traineddata) in the ".\tessdata" folder.
        /// </para>
        /// </remarks>
        /// <exception cref="Exception">
        /// Any failure is reported on the console and rethrown with the original exception
        /// attached as <see cref="Exception.InnerException"/>.
        /// </exception>
        static void Main()
        {
            try
            {
                string tesseractLanguages = "eng";
                string tesseractData = Path.GetFullPath(@".\tessdata");

                // Save the source PDF as an image and keep only the encoded bytes, so that
                // both the source document and the stream can be released before OCR starts.
                // NOTE(review): which page(s) end up in the image depends on the
                // ImageSaveOptions defaults - confirm for multi-page input.
                byte[] imgBytes;
                using (PdfDocument sourceDocument = PdfDocument.Load(@"..\..\..\Vectorized text.pdf"))
                using (MemoryStream ms = new MemoryStream())
                {
                    sourceDocument.Save(ms, new ImageSaveOptions());
                    imgBytes = ms.ToArray();
                }

                using (PdfDocument pdfDocument = new PdfDocument())
                {
                    PdfFormattedText text = new PdfFormattedText();

                    using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                    {
                        var pdfPage = pdfDocument.Pages.Add();
                        engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                        using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                        using (var page = engine.Process(img, "Serachablepdf"))
                        {
                            // Uniform factor that fits the recognized image region into the new page.
                            double scale = Math.Min(pdfPage.Size.Width / page.RegionOfInterest.Width, pdfPage.Size.Height / page.RegionOfInterest.Height);

                            using (var iter = page.GetIterator())
                            {
                                iter.Begin();

                                // Walk the layout hierarchy: blocks -> paragraphs -> text lines.
                                do
                                {
                                    do
                                    {
                                        do
                                        {
                                            string lineText = iter.GetText(PageIteratorLevel.TextLine);

                                            // Draw only lines that have both text and a usable bounding box;
                                            // a failed lookup would otherwise produce a zero font size.
                                            if (!string.IsNullOrWhiteSpace(lineText)
                                                && iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out Rect liRect)
                                                && liRect.Height > 0)
                                            {
                                                text.FontSize = liRect.Height * scale;
                                                //text.Opacity = 0;
                                                text.Append(lineText);

                                                // The line's top edge (Y1) is subtracted from the page height.
                                                // NOTE(review): presumably because the image Y axis points down
                                                // while the page Y axis points up - confirm.
                                                pdfPage.Content.DrawText(text, new PdfPoint(liRect.X1 * scale, pdfPage.Size.Height - liRect.Y1 * scale - text.Height));
                                                text.Clear();
                                            }
                                        } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                    } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                                } while (iter.Next(PageIteratorLevel.Block));
                            }
                        }
                    }

                    pdfDocument.Save(@"text.docx");
                }

                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(@"text.docx") { UseShellExecute = true });
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();

                // Keep the original exception as InnerException so its type and stack trace survive.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
        }
    }
}

Download

Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Imports System
Imports Tesseract
Imports System.Collections.Generic
Imports System.IO
Imports System.Linq

Namespace OCR
    Friend Class OCR
        ''' <summary>
        ''' OCR a PDF document containing text as vector graphics.
        ''' </summary>
        ''' <remarks>
        ''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/ocr-a-pdf-document-containing-text-as-vector-graphics.php
        ''' <para>
        ''' Workflow: the source PDF is saved as an image, the image is recognized with Tesseract
        ''' line by line, and every recognized line is drawn as real text onto a page of a new
        ''' document, which is then saved as "text.docx" and opened with the shell.
        ''' </para>
        ''' <para>
        ''' Requires the Tesseract language data files (*.traineddata) in the ".\tessdata" folder.
        ''' </para>
        ''' </remarks>
        ''' <exception cref="Exception">
        ''' Any failure is reported on the console and rethrown with the original exception
        ''' attached as the inner exception.
        ''' </exception>
        Public Shared Sub Main()
            Try
                Dim tesseractLanguages = "eng"
                Dim tesseractData = Path.GetFullPath(".\tessdata")

                ' Save the source PDF as an image and keep only the encoded bytes, so that
                ' both the source document and the stream can be released before OCR starts.
                ' NOTE(review): which page(s) end up in the image depends on the
                ' ImageSaveOptions defaults - confirm for multi-page input.
                Dim imgBytes As Byte()
                Using sourceDocument As PdfDocument = PdfDocument.Load("..\..\..\Vectorized text.pdf")
                    Using ms As MemoryStream = New MemoryStream()
                        sourceDocument.Save(ms, New ImageSaveOptions())
                        imgBytes = ms.ToArray()
                    End Using
                End Using

                Using outputDocument As PdfDocument = New PdfDocument()
                    Dim text As PdfFormattedText = New PdfFormattedText()

                    Using engine As TesseractEngine = New TesseractEngine(tesseractData, tesseractLanguages, EngineMode.Default)
                        Dim pdfPage = outputDocument.Pages.Add()
                        engine.DefaultPageSegMode = PageSegMode.Auto

                        Using img = Pix.LoadFromMemory(imgBytes)
                            Using page = engine.Process(img, "Serachablepdf")
                                ' Uniform factor that fits the recognized image region into the new page.
                                Dim scale = Math.Min(pdfPage.Size.Width / page.RegionOfInterest.Width, pdfPage.Size.Height / page.RegionOfInterest.Height)

                                Using iter = page.GetIterator()
                                    iter.Begin()

                                    ' Walk the layout hierarchy: blocks -> paragraphs -> text lines.
                                    Do
                                        Do
                                            Do
                                                ' Declared per line so a failed lookup never reuses the previous box.
                                                Dim liRect As Rect = Nothing
                                                Dim lineText = iter.GetText(PageIteratorLevel.TextLine)

                                                ' Draw only lines that have both text and a usable bounding box;
                                                ' a failed lookup would otherwise produce a zero font size.
                                                If Not String.IsNullOrWhiteSpace(lineText) AndAlso
                                                   iter.TryGetBoundingBox(PageIteratorLevel.TextLine, liRect) AndAlso
                                                   liRect.Height > 0 Then
                                                    text.FontSize = liRect.Height * scale
                                                    'text.Opacity = 0;
                                                    text.Append(lineText)

                                                    ' The line's top edge (Y1) is subtracted from the page height.
                                                    ' NOTE(review): presumably because the image Y axis points down
                                                    ' while the page Y axis points up - confirm.
                                                    pdfPage.Content.DrawText(text, New PdfPoint(liRect.X1 * scale, pdfPage.Size.Height - liRect.Y1 * scale - text.Height))
                                                    text.Clear()
                                                End If
                                            Loop While iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine)
                                        Loop While iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para)
                                    Loop While iter.Next(PageIteratorLevel.Block)
                                End Using
                            End Using
                        End Using
                    End Using

                    outputDocument.Save("text.docx")
                End Using

                ' Fully qualified so the listing does not depend on a project-level System.Diagnostics import.
                System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo("text.docx") With {
                    .UseShellExecute = True
                })
            Catch e As Exception
                Console.WriteLine()
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
                Console.ReadKey()

                ' Keep the original exception as the inner exception so its type and stack trace survive.
                Throw New Exception("Error Tesseract: " & e.Message, e)
            End Try
        End Sub
    End Class
End Namespace

Download


If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below:


Captcha

Questions and suggestions from you are always welcome!

We have been developing .NET components since 2002. We know the PDF, DOCX, RTF, HTML, XLSX and image formats. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.