Change Scanned PDF to Editable Word in C# and .NET

Converting scanned PDF documents into editable Word files can be a useful task for many users. This allows you to extract text and images from a PDF and edit them in Word.

In this article, we will look at how to use C# and .NET to perform this task using the SautinSoft.Pdf .NET library.

Step-by-step guide:

  1. Add the SautinSoft.Pdf and Tesseract packages from NuGet.
  2. Load a scanned PDF document.
  3. Apply OCR and extract the text.
  4. Save the document in DOCX format.

Input file: simple text.pdf

Output result:

Complete code

using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
using SautinSoft.Pdf.Objects;
using System;
using Tesseract;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace OCR
{
    class OCR
    {
        /// <summary>
        /// Convert scanned PDF to Word
        /// </summary>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/convert-scanned-pdf-to-word.php
        /// <para>
        /// For every image found on every page, the image is passed to Tesseract,
        /// each recognized text line is drawn onto the page at the position of the
        /// line inside the image, and the image itself is then removed. The result
        /// is saved as "Result.docx" and opened with the shell's default handler.
        /// </para>
        /// </remarks>
        /// <exception cref="Exception">
        /// Any failure is reported on the console and rethrown with the prefix
        /// "Error Tesseract: "; the original exception is kept as the inner exception.
        /// </exception>
        static void Main()
        {
            try
            {
                string tesseractLanguages = "eng";
                string tesseractData = Path.GetFullPath(@".\tessdata");

                // Dispose the document (and the file handle behind it) deterministically.
                using (PdfDocument pdfDocument = PdfDocument.Load(@"..\..\..\Scanned PDF.pdf"))
                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    // The segmentation mode never changes, so set it once instead of once per image.
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                    // A single formatted-text buffer is reused for every line and cleared after each draw.
                    PdfFormattedText text = new PdfFormattedText();

                    foreach (var pdfPage in pdfDocument.Pages)
                    {
                        // Snapshot the images first: each one is removed from the page content below.
                        var images = pdfPage.Content.Elements.All().OfType<PdfImageContent>().ToList();

                        foreach (var image in images)
                        {
                            using (MemoryStream ms = new MemoryStream())
                            {
                                image.Save(ms, new ImageSaveOptions());

                                using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(ms.ToArray()))
                                using (var page = engine.Process(img, "Serachablepdf"))
                                using (var iter = page.GetIterator())
                                {
                                    // Factor that maps OCR pixel coordinates onto the image's bounds on the PDF page.
                                    double scale = Math.Min(image.Bounds.Width / page.RegionOfInterest.Width, image.Bounds.Height / page.RegionOfInterest.Height);

                                    iter.Begin();

                                    // Walk block -> paragraph -> text line.
                                    do
                                    {
                                        do
                                        {
                                            do
                                            {
                                                // Without a bounding box there is neither a position nor a
                                                // usable font size for the line, so it cannot be placed.
                                                if (!iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out Rect liRect))
                                                {
                                                    continue;
                                                }

                                                string lineText = iter.GetText(PageIteratorLevel.TextLine);
                                                if (string.IsNullOrEmpty(lineText))
                                                {
                                                    continue;
                                                }

                                                text.FontSize = liRect.Height * scale;
                                                text.Append(lineText);
                                                // OCR Y grows downwards from the image top; the draw point is computed
                                                // from Bounds.Top minus the scaled offset and the text height.
                                                pdfPage.Content.DrawText(text, new PdfPoint(image.Bounds.Left + liRect.X1 * scale, image.Bounds.Top - liRect.Y1 * scale - text.Height));
                                                text.Clear();
                                            } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                        } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                                    } while (iter.Next(PageIteratorLevel.Block));
                                }
                            }

                            // The recognized text replaces the scanned image on the page.
                            image.Collection.Remove(image);
                        }
                    }

                    pdfDocument.Save(@"Result.docx");
                }

                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo("Result.docx") { UseShellExecute = true });
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                // Keep the original exception (type and stack trace) as the inner exception.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
        }
    }
}

Download

Option Infer On

Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Imports SautinSoft.Pdf.Objects
Imports System
Imports Tesseract
Imports System.Collections.Generic
Imports System.IO
Imports System.Linq

Namespace OCR
	Friend Class OCR
		''' <summary>
		''' Convert scanned PDF to Word
		''' </summary>
		''' <remarks>
		''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/convert-scanned-pdf-to-word.php
		''' <para>
		''' For every image found on every page, the image is passed to Tesseract,
		''' each recognized text line is drawn onto the page at the position of the
		''' line inside the image, and the image itself is then removed. The result
		''' is saved as "Result.docx" and opened with the shell's default handler.
		''' </para>
		''' </remarks>
		''' <exception cref="Exception">
		''' Any failure is reported on the console and rethrown with the prefix
		''' "Error Tesseract: "; the original exception is kept as the inner exception.
		''' </exception>
		Shared Sub Main()
			Try
				Dim tesseractLanguages As String = "eng"
				Dim tesseractData As String = Path.GetFullPath(".\tessdata")

				' Dispose the document (and the file handle behind it) deterministically.
				' The local is named "document" because VB is case-insensitive and a local
				' called pdfDocument would differ from the PdfDocument type only by case.
				Using document As PdfDocument = PdfDocument.Load("..\..\..\Scanned PDF.pdf")
					Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
						' The segmentation mode never changes, so set it once instead of once per image.
						engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

						' A single formatted-text buffer is reused for every line and cleared after each draw.
						Dim text As New PdfFormattedText()

						For Each pdfPage In document.Pages
							' Snapshot the images first: each one is removed from the page content below.
							Dim images = pdfPage.Content.Elements.All().OfType(Of PdfImageContent)().ToList()

							For Each image In images
								Using ms As New MemoryStream()
									image.Save(ms, New ImageSaveOptions())

									Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(ms.ToArray())
										Using page = engine.Process(img, "Serachablepdf")
											' Factor that maps OCR pixel coordinates onto the image's bounds on the PDF page.
											Dim scale As Double = Math.Min(image.Bounds.Width / page.RegionOfInterest.Width, image.Bounds.Height / page.RegionOfInterest.Height)

											Using iter = page.GetIterator()
												iter.Begin()

												' Walk block -> paragraph -> text line.
												Do
													Do
														Do
															' Without a bounding box there is neither a position nor a
															' usable font size for the line, so it cannot be placed.
															Dim liRect As Rect
															If Not iter.TryGetBoundingBox(PageIteratorLevel.TextLine, liRect) Then
																Continue Do
															End If

															Dim lineText As String = iter.GetText(PageIteratorLevel.TextLine)
															If String.IsNullOrEmpty(lineText) Then
																Continue Do
															End If

															text.FontSize = liRect.Height * scale
															text.Append(lineText)
															' OCR Y grows downwards from the image top; the draw point is computed
															' from Bounds.Top minus the scaled offset and the text height.
															pdfPage.Content.DrawText(text, New PdfPoint(image.Bounds.Left + liRect.X1 * scale, image.Bounds.Top - liRect.Y1 * scale - text.Height))
															text.Clear()
														Loop While iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine)
													Loop While iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para)
												Loop While iter.Next(PageIteratorLevel.Block)
											End Using
										End Using
									End Using
								End Using

								' The recognized text replaces the scanned image on the page.
								image.Collection.Remove(image)
							Next image
						Next pdfPage

						document.Save("Result.docx")
					End Using
				End Using

				System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo("Result.docx") With {.UseShellExecute = True})
			Catch e As Exception
				Console.WriteLine()
				Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
				Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
				Console.ReadKey()
				' Keep the original exception (type and stack trace) as the inner exception.
				Throw New Exception("Error Tesseract: " & e.Message, e)
			End Try
		End Sub
	End Class
End Namespace

Download


If you need a new code example or have a question: email us at support@sautinsoft.com or ask at Online Chat (right-bottom corner of this page) or use the Form below:



Questions and suggestions from you are always welcome!

We have been developing .NET components since 2002. We know the PDF, DOCX, RTF, HTML, XLSX and image formats. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.