PdfFocus.COCROptions.eOCRMode Enumeration

Represents the OCR modes: Disabled (default), AllImages, Auto.

Namespace: SautinSoft
Assembly: SautinSoft.PdfFocus (in SautinSoft.PdfFocus.dll) Version: 2024.3.28

Syntax

public enum eOCRMode

Public Enumeration eOCRMode

Members

Member name	Value	Description
Disabled	0	Do not perform OCR (optical character recognition) on images at all. The images will be placed into the resulting document as is.
AllImages	1	Perform OCR (optical character recognition) on every image. Note that the component will treat every image as scanned or photographed text and try to recognize it. Whether recognition succeeds or fails, the images themselves will NOT be placed in the resulting document.
Auto	2	Perform OCR (optical character recognition) only on images that look like scanned or photographed text. Such images (after OCR) will be placed in the resulting document as text. Other images will be placed in the resulting document as images.

Example

Perform OCR using free Tesseract SDK in C#

Copy

using System.IO;
using SautinSoft;
using System;

namespace Example
{
    class Program
    {
        /// <summary>
        /// Entry point of the sample: converts a scanned PDF to DOCX using OCR.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // Note: Please rebuild the project to restore Nuget packages.

            LoadScannedPdf();
        }

        /// <summary>
        /// Load a scanned PDF document with help of Tesseract OCR (free OCR library) and save the result as DOCX document.
        /// </summary>
        static void LoadScannedPdf()
        {
            // Here we'll load a scanned PDF document (perform OCR) and
            // save the OCR result as a new DOCX document.

            // First steps:

            // 1. Download the data files for the languages you need, for example
            // eng.traineddata, rus.traineddata and vie.traineddata
            // (English, Russian and Vietnamese).
            // From here (good and fast): https://github.com/tesseract-ocr/tessdata_fast
            // or (best and slow): https://github.com/tesseract-ocr/tessdata_best

            // 2. Copy the *.traineddata files to the folder "tessdata" in the Project root.

            // 3. Be sure that the folder "tessdata" also contains "pdf.ttf" file.

            // Let's start:
            string inpFile = Path.GetFullPath(@"..\..\..\scan.pdf");
            string outFile = "Result.docx";

            // Get your free 30-day key here:
            // https://sautinsoft.com/start-for-free/

            PdfFocus f = new PdfFocus();
            // Run OCR on every image and route recognition through Tesseract.
            f.OCROptions.Mode = PdfFocus.COCROptions.eOCRMode.AllImages;
            f.OCROptions.Method += PerformOCRTesseract;

            f.OpenPdf(inpFile);
            bool result = false;
            if (f.PageCount > 0)
            {
                try
                {
                    // This sample treats a return value of 0 from ToWord as success.
                    result = f.ToWord(outFile) == 0;
                }
                finally
                {
                    // Release the opened PDF even if the conversion (or the OCR callback) throws.
                    f.ClosePdf();
                }
            }

            // Open the result for demonstration purposes.
            if (result)
            {
                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
            }
            else
            {
                Console.WriteLine("Conversion failed!");
            }
        }

        /// <summary>
        /// OCR callback: recognizes the text on an image with Tesseract and returns
        /// the recognition result as the bytes of a PDF document.
        /// </summary>
        /// <param name="image">The encoded image to recognize; every page frame of the image is processed.</param>
        /// <returns>The content of the PDF file produced by the Tesseract PDF renderer.</returns>
        /// <exception cref="Exception">
        /// Thrown when recognition fails; the original error is available as the inner exception.
        /// </exception>
        public static byte[] PerformOCRTesseract(byte[] image)
        {
            // Languages Tesseract should recognize. Only English is enabled here;
            // to recognize English, Russian and Vietnamese together use "rus+eng+vie".
            string tesseractLanguages = "eng";

            // A path to a folder which contains languages data files and font file "pdf.ttf".
            // Language data files can be found here:
            // Good and fast: https://github.com/tesseract-ocr/tessdata_fast
            // or
            // Best and slow: https://github.com/tesseract-ocr/tessdata_best
            // Also this folder must have write permissions.
            string tesseractData = Path.GetFullPath(@"..\..\..\tessdata\");

            // A path for a temporary PDF file (because Tesseract returns OCR result as PDF document).
            // The output is read back from this path with the ".pdf" extension appended.
            string tempFile = Path.Combine(tesseractData, Path.GetRandomFileName());
            string tempPdf = tempFile + ".pdf";

            bool skipImages = true;

            try
            {
                using (Tesseract.IResultRenderer renderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, skipImages))
                using (renderer.BeginDocument("Serachablepdf"))
                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                    // The source stream stays open for as long as the image is in use;
                    // the image itself is disposed so its GDI+ handle is not leaked.
                    using (MemoryStream msImg = new MemoryStream(image))
                    using (System.Drawing.Image imgWithText = System.Drawing.Image.FromStream(msImg))
                    {
                        int frameCount = imgWithText.GetFrameCount(System.Drawing.Imaging.FrameDimension.Page);
                        for (int i = 0; i < frameCount; i++)
                        {
                            // Re-encode each page frame as PNG before handing it to Tesseract.
                            imgWithText.SelectActiveFrame(System.Drawing.Imaging.FrameDimension.Page, i);
                            using (MemoryStream ms = new MemoryStream())
                            {
                                imgWithText.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
                                byte[] imgBytes = ms.ToArray();
                                using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                                using (var page = engine.Process(img, "Serachablepdf"))
                                {
                                    renderer.AddPage(page);
                                }
                            }
                        }
                    }
                }

                // The renderer has been disposed at this point, so the PDF can be read back.
                return File.ReadAllBytes(tempPdf);
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                // Keep the original exception as InnerException so its type and stack trace are not lost.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
            finally
            {
                // Always remove the temporary PDF, whether recognition succeeded or not.
                if (File.Exists(tempPdf))
                {
                    File.Delete(tempPdf);
                }
            }
        }

    }
}

Perform OCR using free Tesseract SDK in VB.Net

Copy

Option Infer On

Imports System.IO
Imports SautinSoft
Imports System

Namespace Example
    Friend Class Program
        ''' <summary>
        ''' Entry point of the sample: converts a scanned PDF to DOCX using OCR.
        ''' </summary>
        ''' <param name="args">Command-line arguments (not used).</param>
        Shared Sub Main(ByVal args() As String)
            ' Note: Please rebuild the project to restore Nuget packages.

            LoadScannedPdf()
        End Sub

        ''' <summary>
        ''' Load a scanned PDF document with help of Tesseract OCR (free OCR library) and save the result as DOCX document.
        ''' </summary>
        Private Shared Sub LoadScannedPdf()
            ' Here we'll load a scanned PDF document (perform OCR) and
            ' save the OCR result as a new DOCX document.

            ' First steps:

            ' 1. Download the data files for the languages you need, for example
            ' eng.traineddata, rus.traineddata and vie.traineddata
            ' (English, Russian and Vietnamese).
            ' From here (good and fast): https://github.com/tesseract-ocr/tessdata_fast
            ' or (best and slow): https://github.com/tesseract-ocr/tessdata_best

            ' 2. Copy the *.traineddata files to the folder "tessdata" in the Project root.

            ' 3. Be sure that the folder "tessdata" also contains "pdf.ttf" file.

            ' Let's start:
            ' Resolve the input path to an absolute one, as the C# version of this sample does.
            Dim inpFile As String = Path.GetFullPath("..\..\..\scan.pdf")
            Dim outFile As String = "Result.docx"

            Dim f As New PdfFocus()
            ' Run OCR on every image and route recognition through Tesseract.
            f.OCROptions.Mode = PdfFocus.COCROptions.eOCRMode.AllImages
            f.OCROptions.Method = AddressOf PerformOCRTesseract

            f.OpenPdf(inpFile)
            Dim result As Boolean = False
            If f.PageCount > 0 Then
                Try
                    ' This sample treats a return value of 0 from ToWord as success.
                    result = f.ToWord(outFile) = 0
                Finally
                    ' Release the opened PDF even if the conversion (or the OCR callback) throws.
                    f.ClosePdf()
                End Try
            End If

            ' Open the result for demonstration purposes.
            If result Then
                System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
            Else
                Console.WriteLine("Conversion failed!")
            End If
        End Sub

        ''' <summary>
        ''' OCR callback: recognizes the text on an image with Tesseract and returns
        ''' the recognition result as the bytes of a PDF document.
        ''' </summary>
        ''' <param name="image">The encoded image to recognize; every page frame of the image is processed.</param>
        ''' <returns>The content of the PDF file produced by the Tesseract PDF renderer.</returns>
        ''' <exception cref="Exception">
        ''' Thrown when recognition fails; the original error is available as the inner exception.
        ''' </exception>
        Public Shared Function PerformOCRTesseract(ByVal image() As Byte) As Byte()
            ' Languages Tesseract should recognize. Only English is enabled here;
            ' to recognize English, Russian and Vietnamese together use "rus+eng+vie".
            Dim tesseractLanguages As String = "eng"

            ' A path to a folder which contains languages data files and font file "pdf.ttf".
            ' Language data files can be found here:
            ' Good and fast: https://github.com/tesseract-ocr/tessdata_fast
            ' or
            ' Best and slow: https://github.com/tesseract-ocr/tessdata_best
            ' Also this folder must have write permissions.
            Dim tesseractData As String = Path.GetFullPath("..\..\..\tessdata\")

            ' A path for a temporary PDF file (because Tesseract returns OCR result as PDF document).
            ' The output is read back from this path with the ".pdf" extension appended.
            Dim tempFile As String = Path.Combine(tesseractData, Path.GetRandomFileName())
            Dim tempPdf As String = tempFile & ".pdf"

            Dim skipImages As Boolean = True

            Try
                Using renderer As Tesseract.IResultRenderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, skipImages)
                    Using renderer.BeginDocument("Serachablepdf")
                        Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
                            engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto
                            ' The source stream stays open for as long as the image is in use;
                            ' the image itself is disposed so its GDI+ handle is not leaked.
                            Using msImg As New MemoryStream(image)
                                Using imgWithText As System.Drawing.Image = System.Drawing.Image.FromStream(msImg)
                                    Dim frameCount As Integer = imgWithText.GetFrameCount(System.Drawing.Imaging.FrameDimension.Page)
                                    For i As Integer = 0 To frameCount - 1
                                        ' Re-encode each page frame as PNG before handing it to Tesseract.
                                        imgWithText.SelectActiveFrame(System.Drawing.Imaging.FrameDimension.Page, i)
                                        Using ms As New MemoryStream()
                                            imgWithText.Save(ms, System.Drawing.Imaging.ImageFormat.Png)
                                            Dim imgBytes() As Byte = ms.ToArray()
                                            Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
                                                Using page = engine.Process(img, "Serachablepdf")
                                                    renderer.AddPage(page)
                                                End Using
                                            End Using
                                        End Using
                                    Next
                                End Using
                            End Using
                        End Using
                    End Using
                End Using

                ' The renderer has been disposed at this point, so the PDF can be read back.
                Return File.ReadAllBytes(tempPdf)
            Catch e As Exception
                Console.WriteLine()
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
                Console.ReadKey()
                ' Keep the original exception as InnerException so its type and stack trace are not lost.
                Throw New Exception("Error Tesseract: " & e.Message, e)
            Finally
                ' Always remove the temporary PDF, whether recognition succeeded or not.
                If File.Exists(tempPdf) Then
                    File.Delete(tempPdf)
                End If
            End Try
        End Function
    End Class
End Namespace

Reference

SautinSoft Namespace