I was able to extract the images of a page with the code below, but I am still unable to extract the text.
Put the following code in any click event handler:
// Open the PDF read-only; this snippet only reads from the document.
PdfDocument document = PdfReader.Open("C:\\HelloWorld.pdf", PdfDocumentOpenMode.ReadOnly);
// Running number of images found so far; passed to ExportImage for each image.
int imageCount = 0;
foreach (PdfPage page in document.Pages)
{
    // Skip pages that have no /Resources dictionary.
    PdfDictionary pageResources = page.Elements.GetDictionary("/Resources");
    if (pageResources == null)
    {
        continue;
    }

    // Skip pages whose resources contain no /XObject dictionary.
    PdfDictionary externalObjects = pageResources.Elements.GetDictionary("/XObject");
    if (externalObjects == null)
    {
        continue;
    }

    foreach (PdfItem entry in externalObjects.Elements.Values)
    {
        // Only entries stored as references are considered; anything else is ignored.
        PdfReference objectRef = entry as PdfReference;
        if (objectRef == null)
        {
            continue;
        }

        // The referenced object must be a dictionary to be an image candidate.
        PdfDictionary candidate = objectRef.Value as PdfDictionary;
        if (candidate == null)
        {
            continue;
        }

        // Export only external objects whose /Subtype is /Image.
        if (candidate.Elements.GetString("/Subtype") != "/Image")
        {
            continue;
        }

        ExportImage(candidate, ++imageCount);
    }
}
The code above uses the following helper functions:
/// <summary>
/// Chooses an export routine for an image based on the name stored under its /Filter key.
/// /DCTDecode images go to ExportJpegImage and /FlateDecode images go to ExportAsPngImage;
/// images with any other filter value are not exported.
/// </summary>
/// <param name="image">The image dictionary to export.</param>
/// <param name="count">The image number, forwarded unchanged to the export routine.</param>
static void ExportImage(PdfDictionary image, int count)
{
    string filterName = image.Elements.GetName("/Filter");
    if (filterName == "/DCTDecode")
    {
        ExportJpegImage(image, count);
    }
    else if (filterName == "/FlateDecode")
    {
        // NOTE: ExportAsPngImage is not defined in the code shown here and must be supplied separately.
        ExportAsPngImage(image, count);
    }
    // Any other filter value falls through and nothing is written.
}
/// <summary>
/// Exports a JPEG image by writing the bytes of its stream to a numbered file.
/// </summary>
/// <param name="image">The image dictionary whose stream bytes are written out.</param>
/// <param name="count">The number inserted into the output file name.</param>
static void ExportJpegImage(PdfDictionary image, int count)
{
    // The stream bytes are written to disk exactly as stored; no conversion is done here.
    byte[] jpegBytes = image.Stream.Value;
    string targetPath = "C:\\poc_image_" + count.ToString() + ".jpeg";
    File.WriteAllBytes(targetPath, jpegBytes);
}
|