PDF Content Groups allow you to organize the content elements of a PDF document in such a way that they can be transformed and/or cropped together without affecting other parts of the document.
In this article, we will look at how to create and use PDF content groups using the SautinSoft.Pdf library for .NET, with complete examples in C# and VB.NET.
Input file: simple text.png
Complete code
using Net.Pkcs11Interop.HighLevelAPI.MechanismParams;
using SautinSoft.Pdf;
using SautinSoft.Pdf.Content;
using SautinSoft.Pdf.Objects;
using System;
using Tesseract;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace OCR
{
    class OCR
    {
        /// <summary>
        /// Convert scanned Image to searchable PDF
        /// </summary>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/convert-scanned-image-to-searchable-pdf.php
        /// Runs Tesseract over the input image, draws every recognized text line onto a
        /// new PDF page, saves the document as "Result.pdf" and opens it with the shell.
        /// Any failure is reported on the console and rethrown as an <see cref="Exception"/>
        /// whose InnerException is the original error.
        /// </remarks>
        static void Main()
        {
            try
            {
                string tesseractLanguages = "eng";
                string tesseractData = Path.GetFullPath(@".\tessdata");

                PdfDocument pdfDocument = new PdfDocument();
                var pdfPage = pdfDocument.Pages.Add();
                PdfFormattedText text = new PdfFormattedText();

                using (Tesseract.TesseractEngine engine = new Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default))
                {
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto;

                    // File.ReadAllBytes opens the file for reading only and closes it before
                    // returning, so no file handle is left open while OCR runs.
                    byte[] imgBytes = File.ReadAllBytes(@"..\..\..\Potato Beetle.png");

                    using (Tesseract.Pix img = Tesseract.Pix.LoadFromMemory(imgBytes))
                    using (var page = engine.Process(img, "Serachablepdf"))
                    {
                        // Uniform factor that fits the recognized region inside the PDF page
                        // in both dimensions.
                        double scale = Math.Min(pdfPage.Size.Width / page.RegionOfInterest.Width, pdfPage.Size.Height / page.RegionOfInterest.Height);

                        using (var iter = page.GetIterator())
                        {
                            iter.Begin();
                            // Walk block -> paragraph -> text line; each inner loop stops at
                            // the end of its enclosing element.
                            do
                            {
                                do
                                {
                                    do
                                    {
                                        // Skip elements for which Tesseract reports no bounding
                                        // box instead of drawing them with a default (empty) rect.
                                        if (iter.TryGetBoundingBox(PageIteratorLevel.TextLine, out Rect liRect))
                                        {
                                            text.FontSize = liRect.Height * scale;
                                            //text.Opacity = 0;
                                            text.Append(iter.GetText(PageIteratorLevel.TextLine));
                                            // Y is measured from the opposite edge: the line's top and the
                                            // text height are subtracted from the page height
                                            // (presumably the PDF origin is bottom-left - confirm).
                                            pdfPage.Content.DrawText(text, new PdfPoint(liRect.X1 * scale, pdfPage.Size.Height - liRect.Y1 * scale - text.Height));
                                            text.Clear();
                                        }
                                    } while (iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine));
                                } while (iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para));
                            } while (iter.Next(PageIteratorLevel.Block));
                        }
                    }
                }

                pdfDocument.Save(@"Result.pdf");
                System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo("Result.pdf") { UseShellExecute = true });
            }
            catch (Exception e)
            {
                Console.WriteLine();
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder \"tessdata\"");
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast");
                Console.ReadKey();
                // Pass the original exception along so its type and stack trace are not lost.
                throw new Exception("Error Tesseract: " + e.Message, e);
            }
        }
    }
}
Option Infer On
Imports Net.Pkcs11Interop.HighLevelAPI.MechanismParams
Imports SautinSoft.Pdf
Imports SautinSoft.Pdf.Content
Imports SautinSoft.Pdf.Objects
Imports System
Imports Tesseract
Imports System.Collections.Generic
Imports System.IO
Imports System.Linq
Namespace OCR
    Friend Class OCR
        ''' <summary>
        ''' Convert scanned Image to searchable PDF
        ''' </summary>
        ''' <remarks>
        ''' Details: https://sautinsoft.com/products/pdf/help/net/developer-guide/convert-scanned-image-to-searchable-pdf.php
        ''' Runs Tesseract over the input image, draws every recognized text line onto a
        ''' new PDF page, saves the document as "Result.pdf" and opens it with the shell.
        ''' Any failure is reported on the console and rethrown as an Exception whose
        ''' InnerException is the original error.
        ''' </remarks>
        Shared Sub Main()
            Try
                Dim tesseractLanguages As String = "eng"
                Dim tesseractData As String = Path.GetFullPath(".\tessdata")

                Dim pdfDocument As New PdfDocument()
                Dim pdfPage = pdfDocument.Pages.Add()
                Dim text As New PdfFormattedText()

                Using engine As New Tesseract.TesseractEngine(tesseractData, tesseractLanguages, Tesseract.EngineMode.Default)
                    engine.DefaultPageSegMode = Tesseract.PageSegMode.Auto

                    ' File.ReadAllBytes opens the file for reading only and closes it before
                    ' returning, so no file handle is left open while OCR runs.
                    Dim imgBytes() As Byte = File.ReadAllBytes("..\..\..\Potato Beetle.png")

                    Using img As Tesseract.Pix = Tesseract.Pix.LoadFromMemory(imgBytes)
                        Using page = engine.Process(img, "Serachablepdf")
                            ' Uniform factor that fits the recognized region inside the PDF page
                            ' in both dimensions.
                            Dim scale As Double = Math.Min(pdfPage.Size.Width / page.RegionOfInterest.Width, pdfPage.Size.Height / page.RegionOfInterest.Height)

                            Using iter = page.GetIterator()
                                iter.Begin()
                                ' Walk block -> paragraph -> text line; each inner loop stops at
                                ' the end of its enclosing element.
                                Do
                                    Do
                                        Do
                                            Dim liRect As Rect
                                            ' Skip elements for which Tesseract reports no bounding
                                            ' box instead of drawing them with a default (empty) rect.
                                            If iter.TryGetBoundingBox(PageIteratorLevel.TextLine, liRect) Then
                                                text.FontSize = liRect.Height * scale
                                                'text.Opacity = 0;
                                                text.Append(iter.GetText(PageIteratorLevel.TextLine))
                                                ' Y is measured from the opposite edge: the line's top and the
                                                ' text height are subtracted from the page height
                                                ' (presumably the PDF origin is bottom-left - confirm).
                                                pdfPage.Content.DrawText(text, New PdfPoint(liRect.X1 * scale, pdfPage.Size.Height - liRect.Y1 * scale - text.Height))
                                                text.Clear()
                                            End If
                                        Loop While iter.Next(PageIteratorLevel.Para, PageIteratorLevel.TextLine)
                                    Loop While iter.Next(PageIteratorLevel.Block, PageIteratorLevel.Para)
                                Loop While iter.Next(PageIteratorLevel.Block)
                            End Using
                        End Using
                    End Using
                End Using

                pdfDocument.Save("Result.pdf")
                System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo("Result.pdf") With {.UseShellExecute = True})
            Catch e As Exception
                Console.WriteLine()
                Console.WriteLine("Please be sure that you have Language data files (*.traineddata) in your folder ""tessdata""")
                Console.WriteLine("The Language data files can be download from here: https://github.com/tesseract-ocr/tessdata_fast")
                Console.ReadKey()
                ' Pass the original exception along so its type and stack trace are not lost.
                Throw New Exception("Error Tesseract: " & e.Message, e)
            End Try
        End Sub
    End Class
End Namespace
If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below: