Converting scanned PDF documents into editable Word files is a useful task for many users. It allows you to extract the text and images from a PDF and edit them in Word.
In this article, we will look at how to use C# and .NET to perform this task using the SautinSoft.Pdf .NET library.
Step-by-step guide:
Input file: Scanned PDF.pdf
Complete code
using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
using SautinSoft.Pdf.Objects;
using System;
using Tesseract;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace OCR
{
    class OCR
    {
        /// <summary>
        /// Convert scanned PDF to Word
        /// </summary>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/convert-scanned-pdf-to-word.php
        /// For every image on every page of the input PDF, the image is run through Tesseract,
        /// each recognized text line is drawn onto the page at the scaled position of its
        /// bounding box, and the image is then removed from the page. The result is saved
        /// as "Result.docx" and opened with the shell's default application.
        /// </remarks>
        /// <exception cref="Exception">
        /// Any failure is rethrown with the "Error Tesseract: " prefix; the original
        /// exception is available through <see cref="Exception.InnerException"/>.
        /// </exception>
        static void Main()
        {
            try
            {
                string tesseractLanguages = "eng";
                string tesseractData = Path.GetFullPath(@".\tessdata");

                // Dispose the document once the result has been written.
                using (PdfDocument pdfDocument = PdfDocument.Load(@"..\..\..\Scanned PDF.pdf"))
                {
                    PdfFormattedText text = new PdfFormattedText();

                    using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                    {
                        // The segmentation mode never changes, so set it once up front.
                        engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                        foreach (var pdfPage in pdfDocument.Pages)
                        {
                            // Snapshot the images first: they are removed from the page content below.
                            var images = pdfPage.Content.Elements.All().OfType<PdfImageContent>().ToList();

                            foreach (var image in images)
                            {
                                using (MemoryStream ms = new MemoryStream())
                                {
                                    image.Save(ms, new ImageSaveOptions());
                                    byte[] imgBytes = ms.ToArray();

                                    using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                                    using (var page = engine.Process(img, "Serachablepdf"))
                                    {
                                        // Factor that maps OCR pixel coordinates onto the image's bounds on the PDF page.
                                        double scale = Math.Min(
                                            image.Bounds.Width / page.RegionOfInterest.Width,
                                            image.Bounds.Height / page.RegionOfInterest.Height);

                                        using (var iter = page.GetIterator())
                                        {
                                            iter.Begin();
                                            // Walk block -> paragraph -> text line.
                                            do
                                            {
                                                do
                                                {
                                                    do
                                                    {
                                                        // Skip lines without a bounding box; a default rect would
                                                        // otherwise yield a zero font size and a bogus position.
                                                        if (iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out Rect liRect))
                                                        {
                                                            text.FontSize = liRect.Height * scale;
                                                            // text.Opacity = 0; (left disabled, as in the original sample)
                                                            text.Append(iter.GetText(PageIteratorLevel.TextLine));
                                                            // OCR Y grows downwards from the image top, hence the subtraction from Bounds.Top.
                                                            pdfPage.Content.DrawText(text, new PdfPoint(
                                                                image.Bounds.Left + liRect.X1 * scale,
                                                                image.Bounds.Top - liRect.Y1 * scale - text.Height));
                                                            text.Clear();
                                                        }
                                                    } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                                } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                                            } while (iter.Next(PageIteratorLevel.Block));
                                        }
                                    }
                                }

                                // The recognized text replaces the scanned image.
                                image.Collection.Remove(image);
                            }
                        }
                    }

                    pdfDocument.Save(@"Result.docx");
                }

                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo("Result.docx") { UseShellExecute = true });
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                // Keep the original exception as InnerException so its type and stack trace are not lost.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
        }
    }
}
Option Infer On
Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Imports SautinSoft.Pdf.Objects
Imports System
Imports Tesseract
Imports System.Collections.Generic
Imports System.IO
Imports System.Linq
Namespace OCR
    Friend Class OCR
        ''' <summary>
        ''' Convert scanned PDF to Word
        ''' </summary>
        ''' <remarks>
        ''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/convert-scanned-pdf-to-word.php
        ''' For every image on every page of the input PDF, the image is run through Tesseract,
        ''' each recognized text line is drawn onto the page at the scaled position of its
        ''' bounding box, and the image is then removed from the page. The result is saved
        ''' as "Result.docx" and opened with the shell's default application.
        ''' </remarks>
        ''' <exception cref="Exception">
        ''' Any failure is rethrown with the "Error Tesseract: " prefix; the original
        ''' exception is available through InnerException.
        ''' </exception>
        Shared Sub Main()
            Try
                Dim tesseractLanguages As String = "eng"
                Dim tesseractData As String = Path.GetFullPath(".\tessdata")

                ' Dispose the document once the result has been written.
                Using document As PdfDocument = PdfDocument.Load("..\..\..\Scanned PDF.pdf")
                    Dim text As New PdfFormattedText()

                    Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
                        ' The segmentation mode never changes, so set it once up front.
                        engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

                        For Each pdfPage In document.Pages
                            ' Snapshot the images first: they are removed from the page content below.
                            Dim images = pdfPage.Content.Elements.All().OfType(Of PdfImageContent)().ToList()

                            For Each image In images
                                Using ms As New MemoryStream()
                                    image.Save(ms, New ImageSaveOptions())
                                    Dim imgBytes() As Byte = ms.ToArray()

                                    Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
                                        Using page = engine.Process(img, "Serachablepdf")
                                            ' Factor that maps OCR pixel coordinates onto the image's bounds on the PDF page.
                                            Dim scale As Double = Math.Min(image.Bounds.Width / page.RegionOfInterest.Width, image.Bounds.Height / page.RegionOfInterest.Height)

                                            Using iter = page.GetIterator()
                                                iter.Begin()
                                                ' Walk block -> paragraph -> text line.
                                                Do
                                                    Do
                                                        Do
                                                            Dim liRect As Rect
                                                            ' Skip lines without a bounding box; a default rect would
                                                            ' otherwise yield a zero font size and a bogus position.
                                                            If iter.TryGetBoundingBox(PageIteratorLevel.TextLine, liRect) Then
                                                                text.FontSize = liRect.Height * scale
                                                                ' text.Opacity = 0 (left disabled, as in the original sample)
                                                                text.Append(iter.GetText(PageIteratorLevel.TextLine))
                                                                ' OCR Y grows downwards from the image top, hence the subtraction from Bounds.Top.
                                                                pdfPage.Content.DrawText(text, New PdfPoint(image.Bounds.Left + liRect.X1 * scale, image.Bounds.Top - liRect.Y1 * scale - text.Height))
                                                                text.Clear()
                                                            End If
                                                        Loop While iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine)
                                                    Loop While iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para)
                                                Loop While iter.Next(PageIteratorLevel.Block)
                                            End Using
                                        End Using
                                    End Using
                                End Using

                                ' The recognized text replaces the scanned image.
                                image.Collection.Remove(image)
                            Next image
                        Next pdfPage
                    End Using

                    document.Save("Result.docx")
                End Using

                System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo("Result.docx") With {.UseShellExecute = True})
            Catch e As Exception
                Console.WriteLine()
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
                Console.ReadKey()
                ' Keep the original exception as InnerException so its type and stack trace are not lost.
                Throw New Exception("Error Tesseract: " & e.Message, e)
            End Try
        End Sub
    End Class
End Namespace
If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below: