How to launch full text search in PDF, DOCX, RTF and HTML files using C# and .NET


Here we'll show you how to run a full-text search in a specific directory, including its subdirectories.
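Using regular expressions, we'll find every whole-word, case-insensitive occurrence of a search word (the sample below searches for "with", so it matches with, WITH, WiTh, etc.) in all supported files (DOCX, RTF, PDF and HTML) inside the specified directory and output the results to the Console.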

Complete code

using System;
using System.IO;
using System.Collections.Generic;
using SautinSoft.Document;
using System.Drawing;
using System.Drawing.Imaging;
using System.Linq;
using System.Text.RegularExpressions;



namespace Sample
{
    class Sample
    {
        // File extensions (with the leading dot) that the search is restricted to.
        private static readonly string[] SupportedExtensions = { ".docx", ".pdf", ".html", ".rtf" };

        static void Main(string[] args)
        {
            // Get your free 30-day key here:   
            // https://sautinsoft.com/start-for-free/

            string searchDir = Path.GetFullPath(@"..\..\..\searching\");
            string searchText = "with";
            FullTextSearching(searchDir, searchText);
        }

        /// <summary>
        /// This sample shows how to launch full text search in the specific directory.
        /// Every *.docx, *.rtf, *.pdf and *.html file under <paramref name="searchPath"/>
        /// (subdirectories included) is loaded, the whole-word, case-insensitive occurrences
        /// of <paramref name="searchText"/> are counted, and one line per file plus a final
        /// summary is written to the Console.
        /// </summary>
        /// <param name="searchPath">
        /// Directory to search. May be absolute or relative, with or without a trailing
        /// directory separator.
        /// </param>
        /// <param name="searchText">
        /// The word to look for. It is inserted into the regular expression as-is (not escaped),
        /// so regex metacharacters in it are interpreted as pattern syntax.
        /// </param>
        /// <remarks>
        /// Details: https://sautinsoft.com/products/document/help/net/developer-guide/full-text-searching-in-documents-net-csharp-vb.php
        /// </remarks>
        public static void FullTextSearching(string searchPath, string searchText)
        {
            DirectoryInfo searchDir = new DirectoryInfo(searchPath);
            char[] separators = { Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar };

            // Directory.GetFiles is given searchDir.FullName, so every returned path starts with it.
            // Relative names must therefore be cut against this full path, not against the
            // (possibly relative) searchPath argument.
            string rootPath = searchDir.FullName.TrimEnd(separators);

            // 1. Find the files to search in.
            // Only *.docx, *.rtf, *.pdf and *.html files are taken, including subdirectories.
            List<string> supportedFiles = new List<string>();
            foreach (string file in Directory.GetFiles(searchDir.FullName, "*.*", SearchOption.AllDirectories))
            {
                if (IsSupportedFile(file))
                {
                    supportedFiles.Add(file);
                }
            }

            // 2. Perform the text search in each file using a loop
            // and count how many times the file contains the search word.
            Console.WriteLine($"The results for \"{searchText}\":");

            // The pattern does not depend on the file, so build it once.
            Regex regex = new Regex($"\\b({searchText})\\b", RegexOptions.IgnoreCase);

            int totalFiles = 0, totalMatches = 0;
            foreach (string file in supportedFiles)
            {
                DocumentCore dc = DocumentCore.Load(file);
                totalFiles++;

                // Path relative to the search root: just the file name for files in the root
                // folder, "subfolder\file" for files located deeper.
                string fileName = file.Substring(rootPath.Length).TrimStart(separators);

                int matches = dc.Content.Find(regex).Count();
                totalMatches += matches;

                Console.WriteLine($"{totalFiles:D3} from {supportedFiles.Count} {fileName} - {matches} matches.");
            }
            Console.WriteLine($"\nSearching finished. {supportedFiles.Count} file(s) has been processed. Total matches: {totalMatches}.");
            Console.WriteLine("Press any key ...");
            Console.ReadKey();
        }

        // Returns true when the file's extension is one of SupportedExtensions (case-insensitive, ordinal).
        private static bool IsSupportedFile(string file)
        {
            string ext = Path.GetExtension(file);
            foreach (string supported in SupportedExtensions)
            {
                if (string.Equals(ext, supported, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }
            return false;
        }
    }
}

Download

Imports System
Imports System.IO
Imports System.Collections.Generic
Imports SautinSoft.Document
Imports System.Drawing
Imports System.Linq
Imports System.Text.RegularExpressions



Namespace Sample
    Friend Class Sample

        ' File extensions (with the leading dot) that the search is restricted to.
        Private Shared ReadOnly SupportedExtensions As String() = {".docx", ".pdf", ".html", ".rtf"}

        Shared Sub Main(ByVal args() As String)
            ' Get your free 30-day key here:   
            ' https://sautinsoft.com/start-for-free/

            Dim searchDir As String = Path.GetFullPath("..\..\..\searching\")
            Dim searchText As String = "with"
            FullTextSearching(searchDir, searchText)
        End Sub

        ''' <summary>
        ''' This sample shows how to launch full text search in the specific directory.
        ''' Every *.docx, *.rtf, *.pdf and *.html file under <paramref name="searchPath"/>
        ''' (subdirectories included) is loaded, the whole-word, case-insensitive occurrences
        ''' of <paramref name="searchText"/> are counted, and one line per file plus a final
        ''' summary is written to the Console.
        ''' </summary>
        ''' <param name="searchPath">
        ''' Directory to search. May be absolute or relative, with or without a trailing
        ''' directory separator.
        ''' </param>
        ''' <param name="searchText">
        ''' The word to look for. It is inserted into the regular expression as-is (not escaped),
        ''' so regex metacharacters in it are interpreted as pattern syntax.
        ''' </param>
        ''' <remarks>
        ''' Details: https://sautinsoft.com/products/document/help/net/developer-guide/full-text-searching-in-documents-net-csharp-vb.php
        ''' </remarks>
        Public Shared Sub FullTextSearching(ByVal searchPath As String, ByVal searchText As String)
            Dim searchDir As New DirectoryInfo(searchPath)
            Dim separators As Char() = {Path.DirectorySeparatorChar, Path.AltDirectorySeparatorChar}

            ' Directory.GetFiles is given searchDir.FullName, so every returned path starts with it.
            ' Relative names must therefore be cut against this full path, not against the
            ' (possibly relative) searchPath argument.
            Dim rootPath As String = searchDir.FullName.TrimEnd(separators)

            ' 1. Find the files to search in.
            ' Only *.docx, *.rtf, *.pdf and *.html files are taken, including subdirectories.
            Dim supportedFiles As New List(Of String)()
            For Each file As String In Directory.GetFiles(searchDir.FullName, "*.*", SearchOption.AllDirectories)
                If IsSupportedFile(file) Then
                    supportedFiles.Add(file)
                End If
            Next file

            ' 2. Perform the text search in each file using a loop
            ' and count how many times the file contains the search word.
            Console.WriteLine($"The results for ""{searchText}"":")

            ' The pattern does not depend on the file, so build it once.
            Dim wordRegex As New Regex($"\b({searchText})\b", RegexOptions.IgnoreCase)

            Dim totalFiles As Integer = 0, totalMatches As Integer = 0
            For Each file As String In supportedFiles
                Dim dc As DocumentCore = DocumentCore.Load(file)
                totalFiles += 1

                ' Path relative to the search root: just the file name for files in the root
                ' folder, "subfolder\file" for files located deeper.
                Dim fileName As String = file.Substring(rootPath.Length).TrimStart(separators)

                Dim matches As Integer = dc.Content.Find(wordRegex).Count()
                totalMatches += matches

                Console.WriteLine($"{totalFiles:D3} from {supportedFiles.Count} {fileName} - {matches} matches.")
            Next file
            Console.WriteLine($"Searching finished. {supportedFiles.Count} file(s) has been processed. Total matches: {totalMatches}.")
            Console.WriteLine("Press any key ...")
            Console.ReadKey()
        End Sub

        ' Returns True when the file's extension is one of SupportedExtensions (case-insensitive, ordinal).
        Private Shared Function IsSupportedFile(ByVal file As String) As Boolean
            Dim ext As String = Path.GetExtension(file)
            For Each supported As String In SupportedExtensions
                If String.Equals(ext, supported, StringComparison.OrdinalIgnoreCase) Then
                    Return True
                End If
            Next supported
            Return False
        End Function
    End Class
End Namespace

Download


If you need a new code example or have a question, email us at support@sautinsoft.com, ask in the Online Chat (bottom-right corner of this page), or use the form below:



Questions and suggestions from you are always welcome!

We have been developing .NET components since 2002. We know the PDF, DOCX, RTF, HTML, XLSX and image formats. If you need any assistance with creating, modifying or converting documents in various formats, we can help you. We will write any code example for you absolutely free.