Transform Scanned PDF to Word in C# and .NET

Converting scanned PDF documents to Word files can be a difficult task, especially if the document contains text in another language. In this article, we will look at how to use SautinSoft.Pdf .NET to perform this task using C# and .NET.

Step-by-step guide:

  1. Load the required language file.
  2. Add SautinSoft.PDF from NuGet.
  3. Load a PDF document.
  4. Extract the images from the first page.
  5. Perform the OCR of the first page.
  6. Save the document in DOCX format.

Input file: simple text.pdf

Output result:

Complete code

using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
using SautinSoft.Pdf.Objects;
using System;
using Tesseract;
using System.Collections.Generic;
using System.IO;
using System.Linq;

namespace OCR
{
    class OCR
    {
        /// <summary>
        /// Convert scanned PDF with other language to Word
        /// </summary>
        /// <remarks>
        /// Runs Tesseract OCR over every image found on the first page of the source PDF,
        /// draws each recognized text line onto a new page with the same media box, and
        /// saves the result as "text.docx", which is then opened with the shell.
        /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/convert-scanned-pdf-with-other-language-to-word.php
        /// </remarks>
        /// <exception cref="Exception">
        /// Any failure is reported on the console and rethrown with the original
        /// exception attached as <see cref="Exception.InnerException"/>.
        /// </exception>
        static void Main()
        {
            try
            {
                //Before executing, download the required language file from the link: https://github.com/tesseract-ocr/tessdata/tree/main
                //Place the file in a folder convenient for you and specify the path to it.

                string tesseractLanguages = "ron";
                string tesseractData = Path.GetFullPath(@"..\..\..\tessdata");

                // Both documents and the engine are disposed deterministically, even on failure.
                using (PdfDocument sourceDocument = PdfDocument.Load(@"..\..\..\ARGW64125SX.pdf"))
                using (PdfDocument outputDocument = new PdfDocument())
                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    // The segmentation mode is the same for every image, so set it once.
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                    var sourcePage = sourceDocument.Pages[0];
                    var images = sourcePage.Content.Elements.All().OfType<PdfImageContent>().ToList();

                    // The output page copies the media box of the scanned page.
                    var outputPage = outputDocument.Pages.Add();
                    var mediaBox = sourcePage.MediaBox;
                    outputPage.SetMediaBox(mediaBox.Left, mediaBox.Bottom, mediaBox.Right, mediaBox.Top);

                    // A single formatted-text object is reused (and cleared) for every line.
                    PdfFormattedText text = new PdfFormattedText();

                    foreach (var image in images)
                    {
                        var bounds = image.Bounds;

                        using (MemoryStream ms = new MemoryStream())
                        {
                            image.Save(ms, new ImageSaveOptions());

                            using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(ms.ToArray()))
                            using (var page = engine.Process(img, "Serachablepdf"))
                            {
                                // Ratio between the image's bounds on the PDF page and the OCR region;
                                // the smaller of the two axis ratios is applied to both axes.
                                double scale = Math.Min(bounds.Width / page.RegionOfInterest.Width, bounds.Height / page.RegionOfInterest.Height);

                                using (var iter = page.GetIterator())
                                {
                                    iter.Begin();

                                    // Walk every text line: lines within a paragraph, paragraphs within a block, then blocks.
                                    do
                                    {
                                        do
                                        {
                                            do
                                            {
                                                string lineText = iter.GetText(PageIteratorLevel.TextLine);

                                                // Skip positions without a usable box or text (e.g. an image with no
                                                // recognized text) instead of drawing with a zero font size.
                                                if (iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out Rect lineRect)
                                                    && lineRect.Height > 0
                                                    && !string.IsNullOrWhiteSpace(lineText))
                                                {
                                                    text.FontSize = lineRect.Height * scale;
                                                    text.Append(lineText);

                                                    // The line's Y offset and the text height are both subtracted from the image's top edge.
                                                    outputPage.Content.DrawText(text, new PdfPoint(bounds.Left + lineRect.X1 * scale, bounds.Top - lineRect.Y1 * scale - text.Height));
                                                    text.Clear();
                                                }
                                            } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                        } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                                    } while (iter.Next(PageIteratorLevel.Block));
                                }
                            }
                        }
                    }

                    outputDocument.Save(@"text.docx");
                }

                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(@"text.docx") { UseShellExecute = true });
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();

                // Keep the original exception (type and stack trace) available as InnerException.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
        }
    }
}

Download

Option Infer On

Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Imports SautinSoft.Pdf.Objects
Imports System
Imports Tesseract
Imports System.Collections.Generic
Imports System.IO
Imports System.Linq

Namespace OCR
	Friend Class OCR
		''' <summary>
		''' Convert scanned PDF with other language to Word
		''' </summary>
		''' <remarks>
		''' Runs Tesseract OCR over every image found on the first page of the source PDF,
		''' draws each recognized text line onto a new page with the same media box, and
		''' saves the result as "text.docx", which is then opened with the shell.
		''' Any failure is reported on the console and rethrown with the original
		''' exception attached as InnerException.
		''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/convert-scanned-pdf-with-other-language-to-word.php
		''' </remarks>
		Shared Sub Main()
			Try
				'Before executing, download the required language file from the link: https://github.com/tesseract-ocr/tessdata/tree/main
				'Place the file in a folder convenient for you and specify the path to it.

				Dim tesseractLanguages As String = "ron"
				Dim tesseractData As String = Path.GetFullPath("..\..\..\tessdata")

				' Both documents and the engine are disposed deterministically, even on failure.
				Using sourceDocument As PdfDocument = PdfDocument.Load("..\..\..\ARGW64125SX.pdf")
					Using outputDocument As New PdfDocument()
						Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
							' The segmentation mode is the same for every image, so set it once.
							engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

							Dim sourcePage = sourceDocument.Pages(0)
							Dim images = sourcePage.Content.Elements.All().OfType(Of PdfImageContent)().ToList()

							' The output page copies the media box of the scanned page.
							Dim outputPage = outputDocument.Pages.Add()
							Dim mediaBox = sourcePage.MediaBox
							outputPage.SetMediaBox(mediaBox.Left, mediaBox.Bottom, mediaBox.Right, mediaBox.Top)

							' A single formatted-text object is reused (and cleared) for every line.
							Dim text As New PdfFormattedText()

							For Each image In images
								Dim bounds = image.Bounds

								Using ms As New MemoryStream()
									image.Save(ms, New ImageSaveOptions())

									Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(ms.ToArray())
										Using page = engine.Process(img, "Serachablepdf")
											' Ratio between the image's bounds on the PDF page and the OCR region;
											' the smaller of the two axis ratios is applied to both axes.
											Dim scale As Double = Math.Min(bounds.Width / page.RegionOfInterest.Width, bounds.Height / page.RegionOfInterest.Height)

											Using iter = page.GetIterator()
												iter.Begin()

												' Walk every text line: lines within a paragraph, paragraphs within a block, then blocks.
												Do
													Do
														Do
															Dim lineRect As Rect
															Dim lineText As String = iter.GetText(PageIteratorLevel.TextLine)
															Dim hasBox As Boolean = iter.TryGetBoundingBox(PageIteratorLevel.TextLine, lineRect)

															' Skip positions without a usable box or text (e.g. an image with no
															' recognized text) instead of drawing with a zero font size.
															If hasBox AndAlso lineRect.Height > 0 AndAlso Not String.IsNullOrWhiteSpace(lineText) Then
																text.FontSize = lineRect.Height * scale
																text.Append(lineText)

																' The line's Y offset and the text height are both subtracted from the image's top edge.
																outputPage.Content.DrawText(text, New PdfPoint(bounds.Left + lineRect.X1 * scale, bounds.Top - lineRect.Y1 * scale - text.Height))
																text.Clear()
															End If
														Loop While iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine)
													Loop While iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para)
												Loop While iter.Next(PageIteratorLevel.Block)
											End Using
										End Using
									End Using
								End Using
							Next image

							outputDocument.Save("text.docx")
						End Using
					End Using
				End Using

				System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo("text.docx") With {.UseShellExecute = True})
			Catch e As Exception
				Console.WriteLine()
				Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
				Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
				Console.ReadKey()

				' Keep the original exception (type and stack trace) available as InnerException.
				Throw New Exception("Error Tesseract: " & e.Message, e)
			End Try
		End Sub
	End Class
End Namespace

Download


If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below:



Questions and suggestions from you are always welcome!

We have been developing .NET components since 2002. We know the PDF, DOCX, RTF, HTML, XLSX and image formats. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.