Hi @Gani_tpt,
You can use the Tesseract OCR library
to read or extract text from images, and the iTextSharp library
to extract text from PDFs.
iTextSharp
https://www.nuget.org/packages/iTextSharp
Tesseract
https://www.nuget.org/packages/Tesseract
Downloading and configuring Tesseract Data Files
You will need to download the Tesseract Data files from the following link.
https://github.com/tesseract-ocr/tessdata
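Then copy the downloaded folder to the project root folder and rename it to tessdata. The example below loads the "eng" language, so the folder must contain the eng.traineddata file.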
Simple example:
<!-- Web Forms page markup: a single server-side form that lets the user pick a file,
     post it back, and see the extracted text. -->
<html xmlns="http://www.w3.org/1999/xhtml">
<head runat="server">
<title></title>
</head>
<body>
<form id="form1" runat="server">
Select File:
<!-- File picker; the code-behind reads the posted file through FileUpload1. -->
<asp:FileUpload ID="FileUpload1" runat="server" />
<!-- Clicking the button posts the form back and runs the OnUpload handler in the code-behind. -->
<asp:Button Text="Upload" runat="server" OnClick="OnUpload" />
<hr />
<!-- The code-behind writes the extracted text (or a validation message) into this label. -->
<asp:Label ID="lblText" runat="server" />
</form>
</body>
</html>
using iTextSharp.text.pdf.parser;
using iTextSharp.text.pdf;
using System;
using System.Text;
using Tesseract;
namespace WebFormDemo
{
    /// <summary>
    /// Code-behind for the upload page. Saves the posted file under ~/Uploads and displays
    /// the text extracted from it: iTextSharp is used for PDFs and Tesseract OCR for images.
    /// </summary>
    public partial class WebForm21 : System.Web.UI.Page
    {
        protected void Page_Load(object sender, EventArgs e)
        {
        }

        /// <summary>
        /// Click handler for the Upload button. Validates the posted file, saves it under
        /// ~/Uploads, extracts its text and shows the result in <c>lblText</c>.
        /// </summary>
        /// <param name="sender">The control that raised the event.</param>
        /// <param name="e">Event data (unused).</param>
        protected void OnUpload(object sender, EventArgs e)
        {
            // Posting the form without choosing a file would otherwise fail in SaveAs.
            if (!FileUpload1.HasFile)
            {
                lblText.Text = "Please select a file to upload.";
                return;
            }

            // GetFileName strips any client-side directory part from the posted name.
            string fileName = System.IO.Path.GetFileName(FileUpload1.PostedFile.FileName);
            string ext = System.IO.Path.GetExtension(fileName);

            bool isPdf = string.Equals(ext, ".pdf", StringComparison.OrdinalIgnoreCase);
            bool isImage = string.Equals(ext, ".jpg", StringComparison.OrdinalIgnoreCase)
                || string.Equals(ext, ".png", StringComparison.OrdinalIgnoreCase)
                || string.Equals(ext, ".jpeg", StringComparison.OrdinalIgnoreCase);

            // Reject unsupported types BEFORE writing anything to the web-accessible folder,
            // so arbitrary files (e.g. .aspx) are never stored on the server.
            if (!isPdf && !isImage)
            {
                lblText.Text = "Only images and pdf are allowed";
                return;
            }

            string uploadFolder = Server.MapPath("~/Uploads/");
            // No-op when the folder already exists; avoids a failure on first use.
            System.IO.Directory.CreateDirectory(uploadFolder);

            string filePath = System.IO.Path.Combine(uploadFolder, fileName);
            FileUpload1.SaveAs(filePath);

            string extractText = isPdf
                ? this.ExtractTextFromPdf(filePath)
                : this.ExtractTextFromImage(filePath);

            // Label.Text is rendered as raw HTML, and the extracted text comes from an
            // uploaded (untrusted) document, so it is encoded before display.
            lblText.Text = Server.HtmlEncode(extractText);
        }

        /// <summary>
        /// Extracts the text of every page of a PDF file using iTextSharp.
        /// </summary>
        /// <param name="path">Physical path of the PDF file.</param>
        /// <returns>The text of all pages, one page after another, each followed by a line break.</returns>
        private string ExtractTextFromPdf(string path)
        {
            using (PdfReader reader = new PdfReader(path))
            {
                StringBuilder text = new StringBuilder();

                // PDF page numbers are 1-based.
                for (int i = 1; i <= reader.NumberOfPages; i++)
                {
                    // A fresh strategy per page: the strategy object accumulates the text
                    // chunks it is fed, so sharing one instance would mix pages together.
                    ITextExtractionStrategy strategy = new LocationTextExtractionStrategy();
                    text.AppendLine(PdfTextExtractor.GetTextFromPage(reader, i, strategy));
                }

                return text.ToString();
            }
        }

        /// <summary>
        /// Runs Tesseract OCR on an image file using the "eng" language data.
        /// </summary>
        /// <param name="filePath">Physical path of the image file.</param>
        /// <returns>The text recognized in the image.</returns>
        private string ExtractTextFromImage(string filePath)
        {
            // The language data is loaded from the "tessdata" folder in the application root.
            string path = System.IO.Path.Combine(Server.MapPath("~/"), "tessdata");

            using (TesseractEngine engine = new TesseractEngine(path, "eng", EngineMode.Default))
            {
                using (Pix pix = Pix.LoadFromFile(filePath))
                {
                    using (Tesseract.Page page = engine.Process(pix))
                    {
                        return page.GetText();
                    }
                }
            }
        }
    }
}
Best regards,
Lan Huang
If the answer is the right solution, please click "Accept Answer" and kindly upvote it. If you have extra questions about this answer, please click "Comment".
Note: Please follow the steps in our documentation to enable e-mail notifications if you want to receive the related email notification for this thread.