Good day
I'm experimenting with PDF files using PDFsharp & MigraDoc, but I am stuck: I can get all the text from the PDF, but I don't know how to extract only the text I need.
I have a PDF and want to get the text NONE (or some value like 10). This value changes: it will be either None or some integer.
At first I thought I could find "Points" and take the next item (Multas or Licence), but in the document the value I need is not the next item.
Then I thought it might be possible to extract the content sequence (seq) and get the value from there, but I don't understand how.
Code:
using System;
using System.Collections.Generic;
using System.Text;
using PdfSharp.Pdf;
using PdfSharp.Pdf.Content;
using PdfSharp.Pdf.Content.Objects;
using PdfSharp.Pdf.IO;
using PdfSharp.Pdf.Advanced;
using System.Linq;
using PdfSharp.Drawing;
using System.Threading.Tasks;
using System.Text.RegularExpressions;
using System.Linq;
using System.Collections;
using PdfSharpTextExtractor;
using System.Xml.Linq;
namespace PdfSharpTextSplitter
{
/// <summary>
/// Records the content-stream operators that mark the start and end of the
/// section of interest, as detected by Splitter.ExtractedSection.
/// </summary>
public class DocumentParts
{
/// <summary>Operator whose string operand contained "Points".</summary>
public COperator PointsBeginOp { get; set; }
/// <summary>Operator whose string operand contained "Multas".</summary>
public COperator PointEndOp { get; set; }
}
/// <summary>
/// Single Threaded pdf text extractor, use multiple instances for multi-threaded app
/// </summary>
public class Splitter : IDisposable
{
/// <summary>
/// Opens the PDF at <paramref name="file"/> read-only and runs text extraction
/// over every page, writing the strings it encounters to the console.
/// </summary>
/// <param name="file">Path of the PDF file to read.</param>
public static void PdfToText(string file)
{
    using (var _document = PdfReader.Open(file, PdfDocumentOpenMode.ReadOnly))
    // Splitter implements IDisposable, so it is disposed together with the document.
    using (var ext = new Splitter(_document))
    {
        var documentParts = new DocumentParts();
        foreach (var page in _document.Pages)
        {
            // ExtractText stores the page on the splitter itself, so no separate assignment is needed.
            ext.ExtractText(page, documentParts);
        }
    }
}
/// <summary>Page currently being processed; assigned by ExtractText.</summary>
internal PdfPage page { get; set; }

/// <summary>Document this splitter was created for.</summary>
PdfDocument document { get; }

/// <summary>Creates a splitter bound to <paramref name="doc"/>.</summary>
/// <param name="doc">The opened PDF document.</param>
public Splitter(PdfDocument doc) => document = doc;
internal class Font
{
// Raw PDF font dictionary this wrapper was built from.
PdfDictionary font;
// Font kind, derived in the constructor from the dictionary's /Subtype entry.
public enum font_type
{
Type0,
Type1,
Type3,
TrueType,
// Any /Subtype value other than the four above.
Other
};
// Character code -> Unicode text lookup used by ToUnicode(); replaced in the
// constructor by a parsed /ToUnicode CMap or one of the EncodingTables tables.
Dictionary<ushort, string> toUnicode = new Dictionary<ushort, string>();
/// <summary>
/// Translates a character code into its Unicode text using this font's lookup table.
/// </summary>
/// <param name="val">Character code to translate.</param>
/// <returns>The mapped text, or an empty string when no table or no entry exists.</returns>
public string ToUnicode(ushort val)
{
    if (toUnicode == null)
        return "";

    if (toUnicode.TryGetValue(val, out string mapped))
        return mapped;

    // A table exists but has no entry for this code: report it and fall back to empty text.
    Console.WriteLine($"Warning! No unicode symbol for {val}!");
    return "";
}
// Font kind read from /Subtype; keeps the enum's default value (Type0) when the entry is absent.
public font_type Type { get; set; }
// Encoding name read from /Encoding (or from /BaseEncoding of a referenced encoding dictionary); may stay null.
public string Encoding { get; set; }
// Bit mask tested against Flags to choose the Symbol table when no encoding is known.
const int Flag_Symbolic = 4;
// Integer read from the /Flags entry of /FontDescriptor; 0 when not available.
public int Flags { get; set; }
// Set in the constructor when Encoding starts with "/Identity-".
public bool IsTwoByte { get; }
/// <summary>
/// Builds the font wrapper from a PDF font dictionary: reads /Encoding, /Subtype and the
/// /FontDescriptor /Flags value, then picks the code-to-Unicode table (the parsed /ToUnicode
/// CMap when that entry exists, otherwise a built-in table chosen from the encoding name or flags).
/// </summary>
/// <param name="dictionary">Font dictionary taken from a page's /Font resources.</param>
/// <exception cref="ArgumentNullException"><paramref name="dictionary"/> is null.</exception>
public Font(PdfDictionary dictionary)
{
    if (dictionary == null) throw new ArgumentNullException(nameof(dictionary));
    font = dictionary;

    // /Encoding is either a name or an encoding dictionary (direct or referenced).
    PdfItem encodingItem = font.Elements["/Encoding"];
    if (encodingItem != null)
    {
        bool indirect = encodingItem is PdfReference;
        if (Dereference(encodingItem) is PdfDictionary encodingDict)
        {
            PdfItem baseEncoding = encodingDict.Elements["/BaseEncoding"];
            if (baseEncoding != null) Encoding = baseEncoding.ToString();
        }
        else if (!indirect)
        {
            Encoding = encodingItem.ToString();
        }
        // A reference that does not resolve to a dictionary leaves Encoding null instead of crashing.
    }
    IsTwoByte = Encoding != null && Encoding.StartsWith("/Identity-", StringComparison.Ordinal);

    PdfItem subtype = font.Elements["/Subtype"];
    if (subtype != null)
    {
        switch (subtype.ToString())
        {
            case "/TrueType": Type = font_type.TrueType; break;
            case "/Type0": Type = font_type.Type0; break;
            case "/Type1": Type = font_type.Type1; break;
            case "/Type3": Type = font_type.Type3; break;
            default: Type = font_type.Other; break;
        }
    }

    // Flags are only read when the descriptor really is a dictionary and /Flags is an integer.
    if (Dereference(font.Elements["/FontDescriptor"]) is PdfDictionary descriptor
        && Dereference(descriptor.Elements["/Flags"]) is PdfInteger flags)
    {
        Flags = flags.Value;
    }

    if (font.Elements.ContainsKey("/ToUnicode"))
    {
        // Parse the embedded CMap; an entry without a stream keeps the empty table.
        if (Dereference(font.Elements["/ToUnicode"]) is PdfDictionary cmapObject && cmapObject.Stream != null)
        {
            toUnicode = ParseCMap(cmapObject.Stream.ToString());
        }
    }
    else if (Encoding != null)
    {
        switch (Encoding)
        {
            case "/MacRomanEncoding":
                toUnicode = EncodingTables.MacRoman;
                break;
            case "/WinAnsiEncoding":
                toUnicode = EncodingTables.WinAnsi;
                break;
            case "/MacExpertEncoding":
                toUnicode = EncodingTables.MacExpert;
                break;
            case "/StandardEncoding": // the name PDF files actually use
            case "/Standard":
                toUnicode = EncodingTables.Standard;
                break;
            case "/Symbol":
                toUnicode = EncodingTables.Symbol;
                break;
        }
    }
    else if ((Flags & Flag_Symbolic) != 0)
    {
        toUnicode = EncodingTables.Symbol;
    }
    else
    {
        toUnicode = EncodingTables.Standard;
    }
}

// Follows a PdfReference to the object it points at; any other item (or null) is returned unchanged.
static PdfItem Dereference(PdfItem item)
{
    return item is PdfReference reference ? reference.Value : item;
}
// NOTE(review): not referenced anywhere in the visible code; presumably the placeholder
// character for codes without a mapping — confirm before relying on it.
public char nodef_char { get; set; } = '\xE202';
// One bfrange entry: two source codes followed by either a single destination
// string or a bracketed array of destination strings. Whitespace between tokens is optional.
static readonly Regex BfRangeEntry = new Regex(@"<([0-9a-f]+)>\s*<([0-9a-f]+)>\s*(?:<([0-9a-f]+)>|\[([^\]]*)\])", RegexOptions.CultureInvariant);
// One bfchar entry: a source code followed by its destination string.
static readonly Regex BfCharEntry = new Regex(@"<([0-9a-f]+)>\s*<([0-9a-f]+)>", RegexOptions.CultureInvariant);
// A single hex string inside a bfrange destination array.
static readonly Regex HexToken = new Regex(@"<([0-9a-f]+)>", RegexOptions.CultureInvariant);

/// <summary>
/// Parses the bfrange and bfchar sections of a ToUnicode CMap into a
/// character-code to Unicode-text table.
/// </summary>
/// <param name="map">Decoded text of the CMap stream.</param>
/// <returns>
/// The lookup table; empty when <paramref name="map"/> is null/empty or holds no usable
/// entries. Entries that cannot be parsed are skipped. Source codes are clamped to
/// 65534 as before, so larger codes cannot be represented.
/// </returns>
static Dictionary<ushort, string> ParseCMap(string map)
{
    var cmap = new Dictionary<ushort, string>();
    if (string.IsNullOrEmpty(map)) return cmap;
    try
    {
        // Keywords and hex digits are ASCII, so an invariant lower-casing is sufficient.
        map = map.ToLowerInvariant();

        foreach (string section in Sections(map, "beginbfrange", "endbfrange"))
        {
            // Entries are matched across the whole section, so they are consumed as whole
            // units and the strings inside a destination array are never mistaken for a new entry.
            foreach (Match entry in BfRangeEntry.Matches(section))
            {
                int st, end;
                if (!TryParseHex(entry.Groups[1].Value, out st) || !TryParseHex(entry.Groups[2].Value, out end)) continue;
                if (st < 0 || end < 0) continue;
                end = Math.Min(ushort.MaxValue - 1, end);
                st = Math.Min(st, end);

                if (entry.Groups[3].Success)
                {
                    // Single destination: successive codes map to the destination with its
                    // last UTF-16 code unit incremented each time.
                    char[] destination = DecodeUtf16Hex(entry.Groups[3].Value);
                    for (int code = st; code <= end; code++)
                    {
                        cmap[(ushort)code] = new string(destination);
                        destination[destination.Length - 1]++;
                    }
                }
                else
                {
                    // Array destination: one string per code, up to and including the last code.
                    foreach (Match token in HexToken.Matches(entry.Groups[4].Value))
                    {
                        if (st > end) break;
                        cmap[(ushort)st++] = new string(DecodeUtf16Hex(token.Groups[1].Value));
                    }
                }
            }
        }

        foreach (string section in Sections(map, "beginbfchar", "endbfchar"))
        {
            foreach (Match entry in BfCharEntry.Matches(section))
            {
                int st;
                if (!TryParseHex(entry.Groups[1].Value, out st) || st < 0) continue;
                st = Math.Min(st, ushort.MaxValue - 1);
                cmap[(ushort)st] = new string(DecodeUtf16Hex(entry.Groups[2].Value));
            }
        }
    }
    catch (Exception e)
    {
        // Best effort: keep whatever was parsed so far.
        Console.WriteLine("Error parsing cmap range: " + e.Message);
    }
    return cmap;
}

// Yields the text between each beginKeyword and the following endKeyword (or end of input).
static IEnumerable<string> Sections(string map, string beginKeyword, string endKeyword)
{
    int begin = map.IndexOf(beginKeyword, StringComparison.Ordinal);
    while (begin >= 0)
    {
        int bodyStart = begin + beginKeyword.Length;
        int end = map.IndexOf(endKeyword, bodyStart, StringComparison.Ordinal);
        if (end < 0) end = map.Length;
        yield return map.Substring(bodyStart, end - bodyStart);
        begin = map.IndexOf(beginKeyword, end, StringComparison.Ordinal);
    }
}

// Parses a hex string into an int; returns false (instead of throwing) when it does not fit.
static bool TryParseHex(string hex, out int value)
{
    return int.TryParse(hex, System.Globalization.NumberStyles.HexNumber, System.Globalization.CultureInfo.InvariantCulture, out value);
}

// Converts a hex string into UTF-16 code units, four hex digits per unit (the last unit may be shorter).
static char[] DecodeUtf16Hex(string hex)
{
    var units = new char[(hex.Length + 3) / 4];
    for (int i = 0; i < units.Length; i++)
    {
        int start = i * 4;
        string chunk = hex.Substring(start, Math.Min(4, hex.Length - start));
        units[i] = (char)int.Parse(chunk, System.Globalization.NumberStyles.HexNumber, System.Globalization.CultureInfo.InvariantCulture);
    }
    return units;
}
/// <summary>Placeholder: ignores <paramref name="str"/> and always yields an empty string.</summary>
public string ProcessString(string str) => "";
}
// Font selected by the most recent Tf operator on the current page; reset to null for every page.
Font curr_font;
// Fonts of the current page keyed by their /Font resource key; rebuilt by ExtractText(PdfPage, ...).
Dictionary<string, Font> fonts = new Dictionary<string, Font>();
/// <summary>
/// Processes one page: rebuilds the font table from the page's /Font resources, reads the
/// page's content stream and walks it, recording section markers in <paramref name="parts"/>.
/// </summary>
/// <param name="page">Page to process.</param>
/// <param name="parts">Receives the operators that mark the section boundaries.</param>
/// <returns>true when the page was processed; false when any step failed (the error is written to the console).</returns>
public bool ExtractText(PdfPage page, DocumentParts parts)
{
    curr_font = null; // no default font
    this.page = page;
    fonts.Clear();
    try
    {
        // Font loading is inside the try so a malformed font resource is reported
        // through the return value like any other extraction failure.
        LoadFonts(page);
        CSequence seq = ContentReader.ReadContent(page);
        ExtractText(seq, parts);
        return true;
    }
    catch (Exception e)
    {
        Console.WriteLine("Error: " + e.Message);
        return false;
    }
}

// Fills the font table with one Font per dictionary entry found under the page's /Font resources.
private void LoadFonts(PdfPage page)
{
    PdfItem fontResources = page.Resources?.Elements["/Font"];
    if (fontResources is PdfReference resourcesRef) fontResources = resourcesRef.Value;
    if (!(fontResources is PdfDictionary fontTable)) return;

    foreach (var kp in fontTable.Elements)
    {
        PdfItem fobj = kp.Value;
        if (fobj is PdfReference fontRef) fobj = fontRef.Value;
        if (fobj is PdfDictionary fontDictionary)
        {
            fonts.Add(kp.Key, new Font(fontDictionary));
        }
    }
}
/// <summary>
/// Writes a string operand to the console and records <paramref name="op"/> as a section
/// marker when the string contains "Points" (begin) or "Multas" (end).
/// </summary>
/// <param name="obj">Operand to inspect; anything that is not a CString is ignored.</param>
/// <param name="op">Operator the operand belongs to.</param>
/// <param name="parts">Receives the marker operator.</param>
/// <returns>true when a marker was recorded; otherwise false.</returns>
bool ExtractedSection(CObject obj, COperator op, DocumentParts parts)
{
    if (!(obj is CString text))
        return false;

    string value = text.Value;
    if (value.Contains("Points"))
    {
        parts.PointsBeginOp = op;
        Console.WriteLine(value);
        return true;
    }
    if (value.Contains("Multas"))
    {
        parts.PointEndOp = op;
        Console.WriteLine(value);
        return true;
    }

    // Ordinary text: echo it, but it marks nothing.
    Console.WriteLine(value);
    return false;
}
/// <summary>
/// Dispatches a content object to the overload for its concrete type. Only arrays,
/// operators and sequences are handled; every other kind of object is ignored.
/// </summary>
void ExtractText(CObject obj, DocumentParts parts)
{
    // Cases are tested in the same order as before: array, operator, sequence.
    switch (obj)
    {
        case CArray array:
            ExtractText(array, parts);
            break;
        case COperator op:
            ExtractText(op, parts);
            break;
        case CSequence sequence:
            ExtractText(sequence, parts);
            break;
    }
}
/// <summary>Walks every element of <paramref name="array"/> in order.</summary>
private void ExtractText(CArray array, DocumentParts parts)
{
    for (int index = 0; index < array.Count; index++)
    {
        CObject item = array[index];
        ExtractText(item, parts);
    }
}
/// <summary>
/// Handles one content-stream operator: text-showing operators have their strings inspected,
/// text-positioning operators emit a line break on the console, and Tf selects the current font.
/// All other operators are ignored.
/// </summary>
private void ExtractText(COperator op, DocumentParts parts)
{
    switch (op.OpCode.OpCodeName)
    {
        case OpCodeName.QuoteSingle:
        case OpCodeName.QuoteDbl:
        case OpCodeName.Tj:
        case OpCodeName.TJ:
            ShowText(op, parts);
            break;
        case OpCodeName.Tx:
        case OpCodeName.TD:
        case OpCodeName.Td:
        case OpCodeName.Tm:
            // TODO: check if position shifts enough (sometimes Tm is used in word parts)
            Console.WriteLine();
            break;
        case OpCodeName.Tf:
            SelectFont(op);
            break;
    }
}

// Inspects the string operand(s) of a text-showing operator.
private void ShowText(COperator op, DocumentParts parts)
{
    OpCodeName name = op.OpCode.OpCodeName;
    if (name == OpCodeName.QuoteSingle || name == OpCodeName.QuoteDbl)
        Console.WriteLine();

    // Tj, TJ and ' carry one operand; " carries three (word spacing, character spacing, string),
    // with the text as the last one.
    int count = op.Operands.Count;
    if (count != 1 && !(name == OpCodeName.QuoteDbl && count == 3))
    {
        Console.WriteLine("Error TJ!");
        return;
    }

    CObject operand = op.Operands[count - 1];
    if (operand is CArray array)
    {
        foreach (var elem in array)
        {
            if (elem is CString)
            {
                // Strings inside an array are checked exactly like a single string operand;
                // each fragment is checked on its own.
                ExtractedSection(elem, op, parts);
            }
            else if ((elem is CNumber number) && (name == OpCodeName.Tj))
            {
                // NOTE(review): array operands normally belong to TJ, so this Tj test looks like a
                // typo and the branch is unlikely to run; left unchanged — confirm intended
                // operator and threshold sign before enabling.
                if (GetNumberValue(number) > 750)
                {
                    Console.WriteLine();
                }
            }
        }
    }
    else
    {
        ExtractedSection(operand, op, parts);
        ExtractText(operand, parts);
    }
}

// Makes the font named by a Tf operator current; an unknown name clears the current font.
private void SelectFont(COperator op)
{
    if (op.Operands.Count != 2)
    {
        Console.WriteLine("Error in Tf operator");
        return;
    }

    string nF = op.Operands[0].ToString();
    Font selected;
    if (fonts.TryGetValue(nF, out selected))
    {
        curr_font = selected;
    }
    else
    {
        // A font missing from the page's font table must not abort the rest of the page.
        curr_font = null;
        Console.WriteLine($"Warning! Unknown font {nF}!");
    }
}
/// <summary>Returns the numeric value of a content-stream number as a double.</summary>
/// <returns>The value of a CReal or CInteger; NaN for anything else (including null).</returns>
double GetNumberValue(CNumber numb)
{
    switch (numb)
    {
        case CReal real:
            return real.Value;
        case CInteger integer:
            return integer.Value;
        default:
            return double.NaN;
    }
}
//private void ExtractText(CString elem, DocumentParts parts)
//{
// if (curr_font.IsTwoByte)
// {
// foreach (var s in elem.Value.Select((c, i) => new { c = c << (1 - i % 2) * 8, i }).GroupBy(o => o.i / 2).Select(g => (char)g.Sum(o => o.c))
// .Select(c => curr_font.ToUnicode(c)))
// target.Append(s);
// }
// else
// foreach (var s in elem.Value.Select(c => curr_font.ToUnicode(c))) target.Append(s);
//}
/// <summary>Walks every object of the content sequence <paramref name="seq"/> in order.</summary>
private void ExtractText(CSequence seq, DocumentParts parts)
{
    for (int position = 0; position < seq.Count; position++)
    {
        CObject current = seq[position];
        ExtractText(current, parts);
    }
}
/// <summary>Drops the current font and empties the per-page font table.</summary>
public virtual void Dispose()
{
    curr_font = null;
    fonts.Clear();
}
}
}