Optical Character Recognition (OCR) is a powerful technology that allows you to convert various types of documents, such as scanned paper documents, PDF files, or images captured by a digital camera into editable and searchable data.
In this article, we will look at how to perform OCR and extract text from scanned PDF documents in C# and VB.NET with the SautinSoft.Pdf .NET library and the Tesseract OCR engine.
Step-by-step guide:
Input file: simple text.pdf
Complete code
using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
using SautinSoft.Pdf.Objects;
using System;
using Tesseract;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace OCR
{
    class OCR
    {
        /// <summary>
        /// Perform OCR and extract Text from scanned PDF
        /// </summary>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/perform-ocr-and-extract-text-from-scanned-pdf.php
        /// Loads "Scanned.pdf", saves every image found on every page into memory,
        /// runs Tesseract on it and writes the recognized words to the console.
        /// </remarks>
        /// <exception cref="Exception">
        /// Thrown for any failure during loading or recognition; the original
        /// exception is available through <see cref="Exception.InnerException"/>.
        /// </exception>
        static void Main()
        {
            try
            {
                string tesseractLanguages = "eng";
                string tesseractData = Path.GetFullPath(@".\tessdata");

                // Both the PDF document and the OCR engine are disposed when the work is done,
                // even if recognition fails half-way through.
                using (PdfDocument pdfDocument = PdfDocument.Load(@"..\..\..\Scanned.pdf"))
                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    // The segmentation mode is the same for every image, so set it once.
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                    int pageNumber = 1;
                    foreach (var pdfPage in pdfDocument.Pages)
                    {
                        Console.WriteLine("<Page " + (pageNumber++) + ">");

                        // Materialize the image list before processing it.
                        var images = pdfPage.Content.Elements.All().OfType<PdfImageContent>().ToList();
                        foreach (var image in images)
                        {
                            using (MemoryStream ms = new MemoryStream())
                            {
                                image.Save(ms, new ImageSaveOptions());
                                byte[] imgBytes = ms.ToArray();

                                using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                                using (var page = engine.Process(img, "Serachablepdf"))
                                {
                                    WriteRecognizedWords(page);
                                }
                            }
                        }
                    }
                }
            }
            catch (Exception e)
            {
                // Every failure inside the try block ends up here, so the hint below is
                // only a suggestion; the real cause is carried by the rethrown exception.
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();

                // Keep the original exception as InnerException so its type and stack trace survive.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
        }

        /// <summary>
        /// Writes the words of a processed page to the console: each word is followed
        /// by a space, and a line break is written after the last word of a text line.
        /// </summary>
        /// <param name="page">The page returned by the Tesseract engine for one image.</param>
        private static void WriteRecognizedWords(Tesseract.Page page)
        {
            using (var iter = page.GetIterator())
            {
                iter.Begin();

                // Walk the layout hierarchy: block -> paragraph -> text line -> word.
                do
                {
                    do
                    {
                        do
                        {
                            do
                            {
                                Console.Write(iter.GetText(PageIteratorLevel.Word));
                                Console.Write(' ');

                                if (iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word))
                                {
                                    Console.WriteLine();
                                }
                            } while (iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word));
                        } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                    } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                } while (iter.Next(PageIteratorLevel.Block));
            }
        }
    }
}
Option Infer On
Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Imports SautinSoft.Pdf.Objects
Imports System
Imports Tesseract
Imports System.Collections.Generic
Imports System.IO
Imports System.Linq
Namespace OCR
    Friend Class OCR
        ''' <summary>
        ''' Perform OCR and extract Text from scanned PDF
        ''' </summary>
        ''' <remarks>
        ''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/perform-ocr-and-extract-text-from-scanned-pdf.php
        ''' Loads "Scanned.pdf", saves every image found on every page into memory,
        ''' runs Tesseract on it and writes the recognized words to the console.
        ''' </remarks>
        ''' <exception cref="Exception">
        ''' Thrown for any failure during loading or recognition; the original
        ''' exception is available through InnerException.
        ''' </exception>
        Shared Sub Main()
            Try
                Dim tesseractLanguages As String = "eng"
                Dim tesseractData As String = Path.GetFullPath(".\tessdata")

                ' Both the PDF document and the OCR engine are disposed when the work is done,
                ' even if recognition fails half-way through.
                Using document As PdfDocument = PdfDocument.Load("..\..\..\Scanned.pdf")
                    Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
                        ' The segmentation mode is the same for every image, so set it once.
                        engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

                        Dim pageNumber As Integer = 1
                        For Each pdfPage In document.Pages
                            Console.WriteLine("<Page " & pageNumber & ">")
                            pageNumber += 1

                            ' Materialize the image list before processing it.
                            Dim images = pdfPage.Content.Elements.All().OfType(Of PdfImageContent)().ToList()
                            For Each image In images
                                Using ms As New MemoryStream()
                                    image.Save(ms, New ImageSaveOptions())
                                    Dim imgBytes() As Byte = ms.ToArray()

                                    Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
                                        Using ocrPage = engine.Process(img, "Serachablepdf")
                                            WriteRecognizedWords(ocrPage)
                                        End Using
                                    End Using
                                End Using
                            Next image
                        Next pdfPage
                    End Using
                End Using
            Catch e As Exception
                ' Every failure inside the Try block ends up here, so the hint below is
                ' only a suggestion; the real cause is carried by the rethrown exception.
                Console.WriteLine()
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
                Console.ReadKey()

                ' Keep the original exception as InnerException so its type and stack trace survive.
                Throw New Exception("Error Tesseract: " & e.Message, e)
            End Try
        End Sub

        ''' <summary>
        ''' Writes the words of a processed page to the console: each word is followed
        ''' by a space, and a line break is written after the last word of a text line.
        ''' </summary>
        ''' <param name="ocrPage">The page returned by the Tesseract engine for one image.</param>
        Private Shared Sub WriteRecognizedWords(ocrPage As Tesseract.Page)
            Using iter = ocrPage.GetIterator()
                iter.Begin()

                ' Walk the layout hierarchy: block -> paragraph -> text line -> word.
                Do
                    Do
                        Do
                            Do
                                Console.Write(iter.GetText(PageIteratorLevel.Word))
                                Console.Write(" "c)

                                If iter.IsAtFinalOf(PageIteratorLevel.TextLine, PageIteratorLevel.Word) Then
                                    Console.WriteLine()
                                End If
                            Loop While iter.Next(PageIteratorLevel.TextLine, PageIteratorLevel.Word)
                        Loop While iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine)
                    Loop While iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para)
                Loop While iter.Next(PageIteratorLevel.Block)
            End Using
        End Sub
    End Class
End Namespace
If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below: