PdfFocus.COCROptions.eOCRMode Enumeration |
Represents the OCR modes: Disabled (default), All images, Automatic.
Namespace: SautinSoft | Assembly: SautinSoft.PdfFocus (in SautinSoft.PdfFocus.dll) Version: 2024.3.28
Syntax Public Enumeration eOCRMode
Members Member name | Value | Description |
---|
Disabled | 0 |
Do not perform OCR (optical character recognition) on images at all. The images will be placed into the resulting document as is.
|
AllImages | 1 |
Perform OCR (optical character recognition) on every image.
Note that the component will treat every image as scanned or photographed text and will try to recognize it.
Whether recognition (OCR) succeeds or fails, the images themselves will NOT be placed in the resulting document.
|
Auto | 2 |
Perform OCR (optical character recognition) on images that look like scanned or photographed text.
Such images (after performing OCR) will be placed in the resulting document as text.
Other images will be placed in the resulting document as images.
|
Example Perform OCR using free Tesseract SDK in C#
using System.IO;
using SautinSoft;
using System;
namespace Example
{
    /// <summary>
    /// Sample console program: converts a scanned PDF to a Word document with
    /// PdfFocus, delegating the recognition of images to the Tesseract engine.
    /// </summary>
    class Program
    {
        /// <summary>Entry point; runs the scanned-PDF conversion sample.</summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            LoadScannedPdf();
        }

        /// <summary>
        /// Opens <c>scan.pdf</c>, converts it to <c>Result.docx</c> with OCR enabled
        /// for all images and, on success, opens the result with the shell.
        /// Prints "Conversion failed!" when the PDF has no pages or when
        /// <c>ToWord</c> returns a non-zero code.
        /// </summary>
        static void LoadScannedPdf()
        {
            string inpFile = Path.GetFullPath(@"..\..\..\scan.pdf");
            string outFile = "Result.docx";

            PdfFocus f = new PdfFocus();
            // AllImages mode: every image is passed to the OCR callback below.
            f.OCROptions.Mode = PdfFocus.COCROptions.eOCRMode.AllImages;
            f.OCROptions.Method += PerformOCRTesseract;
            f.OpenPdf(inpFile);

            // A return value of 0 from ToWord is treated as success; the
            // conversion is attempted only when at least one page was loaded.
            bool result = f.PageCount > 0 && f.ToWord(outFile) == 0;
            if (!result)
            {
                Console.WriteLine("Conversion failed!");
                return;
            }

            // UseShellExecute lets the OS pick the application registered for .docx.
            System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
        }

        /// <summary>
        /// OCR callback: renders the given image (all of its frames) through
        /// Tesseract into a temporary PDF and returns that PDF's bytes.
        /// </summary>
        /// <param name="image">Encoded image bytes readable by <c>System.Drawing.Image.FromStream</c>.</param>
        /// <returns>The content of the PDF file written by the Tesseract renderer.</returns>
        /// <exception cref="Exception">
        /// Thrown for any failure, after a hint about the language data files has
        /// been printed and a key press awaited. The original error is available
        /// through <c>InnerException</c>.
        /// </exception>
        public static byte[] PerformOCRTesseract(byte[] image)
        {
            string tesseractLanguages = "eng";
            string tesseractData = Path.GetFullPath(@"..\..\..\tessdata\");
            // The renderer receives the name without extension; the file that is
            // read back and cleaned up below is tempFile + ".pdf".
            string tempFile = Path.Combine(tesseractData, Path.GetRandomFileName());
            string tempPdf = tempFile + ".pdf";
            // NOTE(review): third renderer argument; presumably requests a text-only PDF - confirm against the Tesseract wrapper docs.
            bool skipImages = true;

            try
            {
                using (Tesseract.IResultRenderer renderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, skipImages))
                using (renderer.BeginDocument("Serachablepdf"))
                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                    // The source stream must outlive the image, so the image is
                    // the inner (first disposed) resource. Disposing it releases
                    // the GDI+ handle that the original code leaked on every call.
                    using (MemoryStream msImg = new MemoryStream(image))
                    using (System.Drawing.Image imgWithText = System.Drawing.Image.FromStream(msImg))
                    {
                        // The frame count does not change while iterating; query it once.
                        int frameCount = imgWithText.GetFrameCount(System.Drawing.Imaging.FrameDimension.Page);
                        for (int frame = 0; frame < frameCount; frame++)
                        {
                            imgWithText.SelectActiveFrame(System.Drawing.Imaging.FrameDimension.Page, frame);

                            // Re-encode the active frame as PNG so Tesseract can load it from memory.
                            using (MemoryStream ms = new MemoryStream())
                            {
                                imgWithText.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
                                byte[] imgBytes = ms.ToArray();

                                using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                                using (var page = engine.Process(img, "Serachablepdf"))
                                {
                                    renderer.AddPage(page);
                                }
                            }
                        }
                    }
                }

                // Read only after the renderer has been disposed, as the original did.
                return File.ReadAllBytes(tempPdf);
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                // Keep the original exception as InnerException so the stack trace is not lost.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
            finally
            {
                // Always remove the temporary PDF, whether OCR succeeded or not.
                if (File.Exists(tempPdf))
                {
                    File.Delete(tempPdf);
                }
            }
        }
    }
}
Perform OCR using free Tesseract SDK in VB.Net
Option Infer On
Imports System.IO
Imports SautinSoft
Imports System
Namespace Example
    ''' <summary>
    ''' Sample console program: converts a scanned PDF to a Word document with
    ''' PdfFocus, delegating the recognition of images to the Tesseract engine.
    ''' </summary>
    Friend Class Program
        ''' <summary>Entry point; runs the scanned-PDF conversion sample.</summary>
        ''' <param name="args">Command-line arguments (not used).</param>
        Shared Sub Main(ByVal args() As String)
            LoadScannedPdf()
        End Sub

        ''' <summary>
        ''' Opens scan.pdf, converts it to Result.docx with OCR enabled for all
        ''' images and, on success, opens the result with the shell. Prints
        ''' "Conversion failed!" when the PDF has no pages or when ToWord returns
        ''' a non-zero code.
        ''' </summary>
        Private Shared Sub LoadScannedPdf()
            ' Resolve to an absolute path, consistent with the C# version of this sample.
            Dim inpFile As String = Path.GetFullPath("..\..\..\scan.pdf")
            Dim outFile As String = "Result.docx"

            Dim f As New PdfFocus()
            ' AllImages mode: every image is passed to the OCR callback below.
            f.OCROptions.Mode = PdfFocus.COCROptions.eOCRMode.AllImages
            f.OCROptions.Method = AddressOf PerformOCRTesseract
            f.OpenPdf(inpFile)

            ' A return value of 0 from ToWord is treated as success; the
            ' conversion is attempted only when at least one page was loaded.
            Dim result As Boolean = f.PageCount > 0 AndAlso f.ToWord(outFile) = 0
            If Not result Then
                Console.WriteLine("Conversion failed!")
                Return
            End If

            ' UseShellExecute lets the OS pick the application registered for .docx.
            System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
        End Sub

        ''' <summary>
        ''' OCR callback: renders the given image (all of its frames) through
        ''' Tesseract into a temporary PDF and returns that PDF's bytes.
        ''' </summary>
        ''' <param name="image">Encoded image bytes readable by System.Drawing.Image.FromStream.</param>
        ''' <returns>The content of the PDF file written by the Tesseract renderer.</returns>
        ''' <exception cref="Exception">
        ''' Thrown for any failure, after a hint about the language data files has
        ''' been printed and a key press awaited. The original error is available
        ''' through InnerException.
        ''' </exception>
        Public Shared Function PerformOCRTesseract(ByVal image() As Byte) As Byte()
            Dim tesseractLanguages As String = "eng"
            Dim tesseractData As String = Path.GetFullPath("..\..\..\tessdata\")
            ' The renderer receives the name without extension; the file that is
            ' read back and cleaned up below is tempFile & ".pdf".
            Dim tempFile As String = Path.Combine(tesseractData, Path.GetRandomFileName())
            Dim tempPdf As String = tempFile & ".pdf"
            ' NOTE(review): third renderer argument; presumably requests a text-only PDF - confirm against the Tesseract wrapper docs.
            Dim skipImages As Boolean = True

            Try
                Using renderer As Tesseract.IResultRenderer = Tesseract.PdfResultRenderer.CreatePdfRenderer(tempFile, tesseractData, skipImages)
                    Using renderer.BeginDocument("Serachablepdf")
                        Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
                            engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

                            ' The source stream must outlive the image, so the image is
                            ' the inner (first disposed) resource. Disposing it releases
                            ' the GDI+ handle that the original code leaked on every call.
                            Using msImg As New MemoryStream(image)
                                Using imgWithText As System.Drawing.Image = System.Drawing.Image.FromStream(msImg)
                                    ' The frame count does not change while iterating; query it once.
                                    Dim frameCount As Integer = imgWithText.GetFrameCount(System.Drawing.Imaging.FrameDimension.Page)
                                    For frame As Integer = 0 To frameCount - 1
                                        imgWithText.SelectActiveFrame(System.Drawing.Imaging.FrameDimension.Page, frame)

                                        ' Re-encode the active frame as PNG so Tesseract can load it from memory.
                                        Using ms As New MemoryStream()
                                            imgWithText.Save(ms, System.Drawing.Imaging.ImageFormat.Png)
                                            Dim imgBytes() As Byte = ms.ToArray()
                                            Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
                                                Using page = engine.Process(img, "Serachablepdf")
                                                    renderer.AddPage(page)
                                                End Using
                                            End Using
                                        End Using
                                    Next
                                End Using
                            End Using
                        End Using
                    End Using
                End Using

                ' Read only after the renderer has been disposed, as the original did.
                Return File.ReadAllBytes(tempPdf)
            Catch e As Exception
                Console.WriteLine()
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
                Console.ReadKey()
                ' Keep the original exception as InnerException so the stack trace is not lost.
                Throw New Exception("Error Tesseract: " & e.Message, e)
            Finally
                ' Always remove the temporary PDF, whether OCR succeeded or not.
                If File.Exists(tempPdf) Then
                    File.Delete(tempPdf)
                End If
            End Try
        End Function
    End Class
End Namespace
See Also