Optical character recognition (OCR) in C# and VB.NET

Since version 7.0, PDF Focus .Net can work with OCR. To perform OCR, we'll use the free OCR library by Nicomsoft (https://www.nicomsoft.com).
This library is freeware and can be used in commercial applications.

You need to download the Nicomsoft OCR SDK from: http://www.nicomsoft.com/files/ocr/free_NSOCR_v70_build885_full.exe



Then install it on your PC or server.

Notice: Please make sure that you have installed this package (Nicomsoft OCR SDK) on your computer. Also, when creating your solution, you must copy the file "NSOCR.cs" from C:\Program Files (x86)\Nicomsoft OCR\Samples into the directory of your solution. Once "NSOCR.cs" has been copied, the "using NSOCR_NameSpace;" directive will compile without errors.

How to use SautinSoft.Pdf with Optical Character Recognition (OCR).

using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Threading.Tasks;
using System.IO;
using SautinSoft;
using NSOCR_NameSpace;
using System.Drawing.Imaging;


namespace Sample
{
    /// <summary>
    /// Converts a scanned PDF to editable documents, using the Nicomsoft OCR engine
    /// to recognize the text contained in the page images.
    /// </summary>
    public class PdfConverter
    {
        // Nicomsoft OCR engine wrapper; created at the start of ConvertPdfToAllWithOCR.
        internal NSOCRLib.NSOCRClass NsOCR;
        // Engine object handles filled in by Engine_InitializeAdvanced.
        internal int CfgObj = 0;
        internal int OcrObj = 0;
        internal int ImgObj = 0;
        // Not referenced anywhere in this class.
        internal int ScanObj = 0;
        // Saver handle; created and released inside PerformOCR for every image.
        internal int SvrObj = 0;
        // Not referenced anywhere in this class.
        internal bool OCRCreated = false;

        /// <summary>
        /// Converts a PDF to DOCX ("Result.docx") and HTML ("Result.html") with the OCR engine.
        /// Each output whose conversion call returns 0 is opened through the operating-system shell.
        /// </summary>
        /// <param name="pdfPath">Path of the PDF file to convert.</param>
        public void ConvertPdfToAllWithOCR(string pdfPath)
        {
            // To perform OCR we'll use free OCR library by Nicomsoft.
            // https://www.nicomsoft.com/products/ocr/download/
            // The library is freeware and can be used in commercial application.
            NsOCR = new NSOCRLib.NSOCRClass();
            // A license key is required for the licensed version only; uncomment and insert yours if needed.
            // NsOCR.Engine_SetLicenseKey("AB2A4DD5FF2A");
            NsOCR.Engine_InitializeAdvanced(out CfgObj, out OcrObj, out ImgObj);

            try
            {
                SautinSoft.Pdf f = new SautinSoft.Pdf();
                // PerformOCR is invoked by the converter for the images it decides to recognize.
                f.OCROptions.Method = PerformOCR;
                f.OCROptions.Mode = PdfDocument.COCROptions.eOCRMode.AllImages;
                f.WordOptions.KeepCharScaleAndSpacing = false;

                f.OpenPdf(pdfPath);
                if (f.PageCount > 0)
                {
                    // To Docx.
                    string outFile = "Result.docx";
                    f.WordOptions.Format = PdfDocument.CWordOptions.eWordDocument.Docx;
                    if (f.ToWord(outFile) == 0)
                    {
                        System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
                    }

                    // To HTML.
                    outFile = "Result.html";
                    f.HtmlOptions.KeepCharScaleAndSpacing = false;
                    if (f.ToHtml(outFile) == 0)
                    {
                        System.Diagnostics.Process.Start(new System.Diagnostics.ProcessStartInfo(outFile) { UseShellExecute = true });
                    }
                }
                else
                {
                    Console.WriteLine("Error: {0}!", f.Exception.Message);
                    Console.ReadLine();
                }
            }
            finally
            {
                // The engine was initialized above and is not needed once the conversions are done;
                // without this call it would never be released.
                NsOCR.Engine_Uninitialize();
                CfgObj = 0;
                OcrObj = 0;
                ImgObj = 0;
                SvrObj = 0;
            }
        }

        /// <summary>
        /// OCR callback: recognizes the text of one image and returns the result as a PDF.
        /// </summary>
        /// <param name="scanned">Image handed over by the PDF converter.</param>
        /// <returns>
        /// The bytes of the PDF produced by the OCR engine, or <c>null</c> when any OCR step
        /// reports an error or throws.
        /// </returns>
        private byte[] PerformOCR(System.Drawing.Image scanned)
        {
            try
            {
                try
                {
                    NsOCR.Cfg_SetOption(CfgObj, TNSOCR.BT_DEFAULT, "Languages/English", "1");

                    // The engine loads the image from memory, so encode it as PNG first.
                    Array imgArray;
                    using (MemoryStream ms = new MemoryStream())
                    {
                        scanned.Save(ms, ImageFormat.Png);
                        imgArray = ms.ToArray();
                    }

                    // Result codes above TNSOCR.ERROR_FIRST are treated as failures.
                    int res = NsOCR.Img_LoadFromMemory(ImgObj, ref imgArray, imgArray.Length);
                    if (res > TNSOCR.ERROR_FIRST)
                    {
                        return null;
                    }

                    res = NsOCR.Svr_Create(CfgObj, TNSOCR.SVR_FORMAT_PDF, out SvrObj);
                    if (res > TNSOCR.ERROR_FIRST)
                    {
                        return null;
                    }
                    NsOCR.Svr_NewDocument(SvrObj);

                    res = NsOCR.Img_OCR(ImgObj, TNSOCR.OCRSTEP_FIRST, TNSOCR.OCRSTEP_LAST, TNSOCR.OCRFLAG_NONE);
                    if (res > TNSOCR.ERROR_FIRST)
                    {
                        return null;
                    }

                    res = NsOCR.Svr_AddPage(SvrObj, ImgObj, TNSOCR.FMT_EXACTCOPY);
                    if (res > TNSOCR.ERROR_FIRST)
                    {
                        return null;
                    }

                    Array outPdf = null;
                    NsOCR.Svr_SaveToMemory(SvrObj, out outPdf);

                    return (byte[])outPdf;
                }
                finally
                {
                    // A new saver is created for every image; release it so that handles
                    // do not pile up while a multi-image document is processed.
                    if (SvrObj != 0)
                    {
                        NsOCR.Svr_Destroy(SvrObj);
                        SvrObj = 0;
                    }
                }
            }
            catch
            {
                // Deliberate best effort: any failure is reported to the caller as "no OCR result".
                return null;
            }
        }
    }
    /// <summary>
    /// Console entry point that runs the OCR conversion on the sample scanned PDF.
    /// </summary>
    class Sample
    {
        /// <summary>
        /// Resolves the path of the sample PDF and converts it with OCR.
        /// </summary>
        /// <param name="args">Command-line arguments (not used).</param>
        static void Main(string[] args)
        {
            // OCR is done with the free Nicomsoft OCR library
            // (https://www.nicomsoft.com/products/ocr/download/).
            // It is freeware and may be used in commercial applications.

            // The relative path is resolved against the current working directory.
            var scanPath = Path.GetFullPath(@"..\..\scan.pdf");
            var ocrConverter = new PdfConverter();
            ocrConverter.ConvertPdfToAllWithOCR(scanPath);

            // Troubleshooting: if this sample does not compile and reports errors such as
            //   NSOCRClass: Engine_SetLicenseKey
            //   PdfDocument: OCROptions
            // then:
            //   1. Download Nicomsoft OCR SDK from: http://www.nicomsoft.com/files/ocr/free_NSOCR_v70_build885_full.exe
            //   2. Install it on your PC or server-side.
            //   3. Launch the code sample again.

            // See the full manual - How to use PDF Focus .Net with OCR (Readme.html).
            // IMPORTANT: PDF Focus .Net supports OCR since version 7.0
        }
    }
}

            Imports System
Imports System.Collections.Generic
Imports System.Linq
Imports System.Text
Imports System.Threading.Tasks
Imports System.IO
Imports SautinSoft
Imports NSOCR_NameSpace
Imports System.Drawing.Imaging

Namespace Sample
    ''' <summary>
    ''' Converts a scanned PDF to editable documents, using the Nicomsoft OCR engine
    ''' to recognize the text contained in the page images.
    ''' </summary>
    Public Class PdfConverter
        ' Nicomsoft OCR engine wrapper; created at the start of ConvertPdfToAllWithOCR.
        Friend NsOCR As NSOCRLib.NSOCRClass
        ' Engine object handles filled in by Engine_InitializeAdvanced.
        Friend CfgObj As Integer = 0
        Friend OcrObj As Integer = 0
        Friend ImgObj As Integer = 0
        ' Not referenced anywhere in this class.
        Friend ScanObj As Integer = 0
        ' Saver handle; created and released inside PerformOCR for every image.
        Friend SvrObj As Integer = 0
        ' Not referenced anywhere in this class.
        Friend OCRCreated As Boolean = False

        ''' <summary>
        ''' Converts a PDF to DOCX ("Result.docx") and HTML ("Result.html") with the OCR engine.
        ''' Each output whose conversion call returns 0 is opened through the operating-system shell.
        ''' </summary>
        ''' <param name="pdfPath">Path of the PDF file to convert.</param>
        Public Sub ConvertPdfToAllWithOCR(ByVal pdfPath As String)
            ' To perform OCR we'll use free OCR library by Nicomsoft.
            ' https://www.nicomsoft.com/products/ocr/download/
            ' The library is freeware and can be used in commercial application.
            NsOCR = New NSOCRLib.NSOCRClass()
            ' A license key is required for the licensed version only; uncomment and insert yours if needed.
            'NsOCR.Engine_SetLicenseKey("AB2A4DD5FF2A")
            NsOCR.Engine_InitializeAdvanced(CfgObj, OcrObj, ImgObj)

            Try
                Dim f As New SautinSoft.Pdf()
                ' PerformOCR is invoked by the converter for the images it decides to recognize.
                f.OCROptions.Method = AddressOf PerformOCR
                f.OCROptions.Mode = PdfDocument.COCROptions.eOCRMode.AllImages
                f.WordOptions.KeepCharScaleAndSpacing = False

                f.OpenPdf(pdfPath)
                If f.PageCount > 0 Then
                    ' To Docx.
                    Dim outFile As String = "Result.docx"
                    f.WordOptions.Format = PdfDocument.CWordOptions.eWordDocument.Docx
                    If f.ToWord(outFile) = 0 Then
                        System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
                    End If

                    ' To HTML.
                    outFile = "Result.html"
                    f.HtmlOptions.KeepCharScaleAndSpacing = False
                    If f.ToHtml(outFile) = 0 Then
                        System.Diagnostics.Process.Start(New System.Diagnostics.ProcessStartInfo(outFile) With {.UseShellExecute = True})
                    End If
                Else
                    Console.WriteLine("Error: {0}!", f.Exception.Message)
                    Console.ReadLine()
                End If
            Finally
                ' The engine was initialized above and is not needed once the conversions are done;
                ' without this call it would never be released.
                NsOCR.Engine_Uninitialize()
                CfgObj = 0
                OcrObj = 0
                ImgObj = 0
                SvrObj = 0
            End Try
        End Sub

        ''' <summary>
        ''' OCR callback: recognizes the text of one image and returns the result as a PDF.
        ''' </summary>
        ''' <param name="scanned">Image handed over by the PDF converter.</param>
        ''' <returns>
        ''' The bytes of the PDF produced by the OCR engine, or Nothing when any OCR step
        ''' reports an error or throws.
        ''' </returns>
        Private Function PerformOCR(ByVal scanned As System.Drawing.Image) As Byte()
            Try
                Try
                    NsOCR.Cfg_SetOption(CfgObj, TNSOCR.BT_DEFAULT, "Languages/English", "1")

                    ' The engine loads the image from memory, so encode it as PNG first.
                    Dim imgArray As Array = Nothing
                    Using ms As New MemoryStream()
                        scanned.Save(ms, ImageFormat.Png)
                        imgArray = ms.ToArray()
                    End Using

                    ' Result codes above TNSOCR.ERROR_FIRST are treated as failures.
                    Dim res As Integer = NsOCR.Img_LoadFromMemory(ImgObj, imgArray, imgArray.Length)
                    If res > TNSOCR.ERROR_FIRST Then
                        Return Nothing
                    End If

                    res = NsOCR.Svr_Create(CfgObj, TNSOCR.SVR_FORMAT_PDF, SvrObj)
                    If res > TNSOCR.ERROR_FIRST Then
                        Return Nothing
                    End If
                    NsOCR.Svr_NewDocument(SvrObj)

                    res = NsOCR.Img_OCR(ImgObj, TNSOCR.OCRSTEP_FIRST, TNSOCR.OCRSTEP_LAST, TNSOCR.OCRFLAG_NONE)
                    If res > TNSOCR.ERROR_FIRST Then
                        Return Nothing
                    End If

                    res = NsOCR.Svr_AddPage(SvrObj, ImgObj, TNSOCR.FMT_EXACTCOPY)
                    If res > TNSOCR.ERROR_FIRST Then
                        Return Nothing
                    End If

                    Dim outPdf As Array = Nothing
                    NsOCR.Svr_SaveToMemory(SvrObj, outPdf)

                    Return CType(outPdf, Byte())

                Finally
                    ' A new saver is created for every image; release it so that handles
                    ' do not pile up while a multi-image document is processed.
                    If SvrObj <> 0 Then
                        NsOCR.Svr_Destroy(SvrObj)
                        SvrObj = 0
                    End If
                End Try
            Catch
                ' Deliberate best effort: any failure is reported to the caller as "no OCR result".
                Return Nothing
            End Try
        End Function
    End Class
    ''' <summary>
    ''' Console entry point that runs the OCR conversion on the sample scanned PDF.
    ''' </summary>
    Friend Class Sample
        ''' <summary>
        ''' Resolves the path of the sample PDF and converts it with OCR.
        ''' </summary>
        ''' <param name="args">Command-line arguments (not used).</param>
        Shared Sub Main(ByVal args() As String)
            ' OCR is done with the free Nicomsoft OCR library
            ' (https://www.nicomsoft.com/products/ocr/download/).
            ' It is freeware and may be used in commercial applications.

            ' The relative path is resolved against the current working directory.
            Dim scanPath As String = Path.GetFullPath("..\scan.pdf")
            Dim ocrConverter As PdfConverter = New PdfConverter()
            ocrConverter.ConvertPdfToAllWithOCR(scanPath)

            ' Troubleshooting: if this sample does not compile and reports errors such as
            '   NSOCRClass: Engine_SetLicenseKey
            '   PdfDocument: OCROptions
            ' then:
            '   1. Download Nicomsoft OCR SDK from: http://www.nicomsoft.com/files/ocr/free_NSOCR_v70_build885_full.exe
            '   2. Install it on your PC or server-side.
            '   3. Launch the code sample again.

            ' See the full manual - How to use PDF Focus .Net with OCR (Readme.html).
            ' IMPORTANT: PDF Focus .Net supports OCR since version 7.0
        End Sub
    End Class
End Namespace

Download.


If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below:



Questions and suggestions from you are always welcome!

We have been developing .NET components since 2002. We know the PDF, DOCX, RTF, HTML, XLSX and image formats. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.