This commit is contained in:
tomse
2025-09-07 23:12:14 +02:00
parent 4b56ee8c2b
commit 7a5ddf19ba
165 changed files with 285 additions and 103 deletions

View File

@@ -0,0 +1,18 @@
using PdfSharp.Fonts;
using System.IO;
/// <summary>
/// PDFsharp font resolver that maps every requested font family to the
/// "arial.ttf" file located in the operating system's fonts folder.
/// </summary>
public class SystemFontResolver : IFontResolver
{
    // Face name returned by ResolveTypeface; GetFont serves the same file for any face name.
    private const string ArialFaceName = "Arial#";

    // File name of the regular Arial face inside the system fonts folder.
    private const string ArialFileName = "arial.ttf";

    /// <summary>
    /// Resolves any requested typeface to the single Arial face.
    /// </summary>
    /// <param name="familyName">Requested family name; ignored, every family maps to Arial.</param>
    /// <param name="isBold">Whether a bold face was requested.</param>
    /// <param name="isItalic">Whether an italic face was requested.</param>
    /// <returns>
    /// Resolver info for the Arial face. Only the regular font file is served,
    /// so bold/italic requests are flagged for simulation instead of being dropped.
    /// </returns>
    public FontResolverInfo ResolveTypeface(string familyName, bool isBold, bool isItalic)
    {
        // Previously the style flags were discarded and styled text rendered as regular.
        return new FontResolverInfo(ArialFaceName, isBold, isItalic);
    }

    /// <summary>
    /// Loads the raw bytes of the Arial font file.
    /// </summary>
    /// <param name="faceName">Face name requested by PDFsharp; ignored because only one face is served.</param>
    /// <returns>The full contents of "arial.ttf" from the system fonts folder.</returns>
    public byte[] GetFont(string faceName)
    {
        // NOTE(review): SpecialFolder.Fonts may be empty on non-Windows platforms,
        // in which case the path is resolved relative to the working directory — confirm.
        string fontsFolder = System.Environment.GetFolderPath(System.Environment.SpecialFolder.Fonts);
        string fontPath = Path.Combine(fontsFolder, ArialFileName);
        return File.ReadAllBytes(fontPath);
    }
}

View File

@@ -1,146 +1,173 @@
using System;
using System.IO;
using System.Drawing;
using BitMiracle.LibTiff.Classic;
using System.Threading.Tasks;
using System.Windows.Forms;
using PdfSharp.Pdf;
using PdfSharp.Drawing;
using Tesseract;
using System.Windows.Forms;
using System.Linq;
using ImageMagick;
namespace RCEU_PDFWorkflowManager
{
public class TiffToPdfConverter
{
public void ConvertTiffToPdfAWithOcr(
/// <summary>
/// Maximum number of threads to use for page processing
/// </summary>
public int MaxDegreeOfParallelism { get; set; } = Environment.ProcessorCount;
/// <summary>
/// Converts all TIFF files in a directory to a searchable PDF.
/// </summary>
public async Task ConvertTiffToPdfWithOcrAsync(
string workOutDir,
string outputPdfPath,
string ocrLanguage,
string outputPdfFileName, // e.g., "output.pdf"
string selectedLanguage,
ToolStripProgressBar progressBar,
ToolStripStatusLabel statusLabel)
ToolStripStatusLabel statusLabel,
Form mainForm,
string bannerPdfPath = null) // optional banner PDF
{
string[] tiffFiles = Directory.GetFiles(workOutDir, "*.tif");
if (tiffFiles.Length == 0)
throw new FileNotFoundException("No TIFF files found in the directory.");
// Count total pages for progress bar
int totalPages = tiffFiles.Sum(file => Tiff.Open(file, "r")?.NumberOfDirectories() ?? 0);
progressBar.Maximum = totalPages;
progressBar.Value = 0;
statusLabel.Text = "Converting TIFFs to PDF...";
int totalPages = 0;
foreach (var file in tiffFiles)
using (var collection = new MagickImageCollection(file))
totalPages += collection.Count;
PdfDocument pdf = new PdfDocument();
pdf.Info.Title = "Converted TIFF to PDF/A";
pdf.Info.Creator = "RCEU_PDFWorkflowManager";
foreach (var tiffFile in tiffFiles)
mainForm.Invoke((MethodInvoker)(() =>
{
using (Tiff image = Tiff.Open(tiffFile, "r"))
progressBar.Maximum = totalPages;
progressBar.Value = 0;
statusLabel.Text = "Converting TIFFs to PDF...";
}));
bool success = true;
try
{
await Task.Run(() =>
{
if (image == null) continue;
int pageCount = image.NumberOfDirectories();
for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
foreach (var tiffFile in tiffFiles)
{
image.SetDirectory((short)pageIndex);
int width = image.GetField(TiffTag.IMAGEWIDTH)[0].ToInt();
int height = image.GetField(TiffTag.IMAGELENGTH)[0].ToInt();
int[] raster = new int[width * height];
image.ReadRGBAImage(width, height, raster);
using (var bmp = new Bitmap(width, height, System.Drawing.Imaging.PixelFormat.Format32bppArgb))
using (var collection = new MagickImageCollection(tiffFile))
{
var bmpData = bmp.LockBits(
new Rectangle(0, 0, width, height),
System.Drawing.Imaging.ImageLockMode.WriteOnly,
bmp.PixelFormat);
System.Runtime.InteropServices.Marshal.Copy(raster, 0, bmpData.Scan0, raster.Length);
bmp.UnlockBits(bmpData);
PdfPage pagePdf = pdf.AddPage();
pagePdf.Width = XUnit.FromPoint(width);
pagePdf.Height = XUnit.FromPoint(height);
using (XGraphics gfx = XGraphics.FromPdfPage(pagePdf))
Parallel.ForEach(collection, new ParallelOptions { MaxDegreeOfParallelism = MaxDegreeOfParallelism }, magickImage =>
{
// Save temp PNG
string tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid() + ".png");
bmp.Save(tempPath, System.Drawing.Imaging.ImageFormat.Png);
using (XImage ximg = XImage.FromFile(tempPath))
using (Bitmap bmp = magickImage.ToBitmap())
{
gfx.DrawImage(ximg, 0, 0, width, height);
string extractedText = "";
try
{
extractedText = PerformOcr(bmp, selectedLanguage);
}
catch
{
// If OCR fails, continue but mark success false
success = false;
}
lock (pdf)
{
PdfPage page = pdf.AddPage();
page.Width = XUnit.FromPoint(bmp.Width).Point;
page.Height = XUnit.FromPoint(bmp.Height).Point;
using (var gfx = XGraphics.FromPdfPage(page))
using (var ms = new MemoryStream())
{
bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
ms.Position = 0;
using (var ximg = XImage.FromStream(ms))
gfx.DrawImage(ximg,
XUnit.FromPoint(0),
XUnit.FromPoint(0),
XUnit.FromPoint(bmp.Width),
XUnit.FromPoint(bmp.Height));
}
OverlayTextOntoPdfPage(page, extractedText);
}
mainForm.Invoke((MethodInvoker)(() =>
{
progressBar.Value++;
statusLabel.Text = $"Processing page {progressBar.Value} of {progressBar.Maximum}";
}));
}
File.Delete(tempPath);
}
// OCR and overlay text
string extractedText = PerformOcr(tiffFile, pageIndex, ocrLanguage);
OverlayTextOntoPdfPage(pdf, pagePdf, extractedText);
// Update progress bar safely
if (progressBar.InvokeRequired)
{
progressBar.Invoke((MethodInvoker)delegate
{
progressBar.Value++;
});
}
else
{
progressBar.Value++;
}
});
}
}
}
}
pdf.Save(outputPdfPath);
statusLabel.Text = "Conversion complete!";
}
// OCR method
private string PerformOcr(string tiffFile, int pageIndex, string language)
{
using (Tiff tiff = Tiff.Open(tiffFile, "r"))
{
tiff.SetDirectory((short)pageIndex);
int width = tiff.GetField(TiffTag.IMAGEWIDTH)[0].ToInt();
int height = tiff.GetField(TiffTag.IMAGELENGTH)[0].ToInt();
int[] raster = new int[width * height];
tiff.ReadRGBAImage(width, height, raster);
using (Bitmap bmp = new Bitmap(width, height, System.Drawing.Imaging.PixelFormat.Format32bppArgb))
{
for (int y = 0; y < height; y++)
for (int x = 0; x < width; x++)
{
int rgba = raster[y * width + x];
int r = rgba & 0xFF;
int g = (rgba >> 8) & 0xFF;
int b = (rgba >> 16) & 0xFF;
bmp.SetPixel(x, height - y - 1, Color.FromArgb(255, r, g, b));
}
using (var engine = new TesseractEngine(@"./tessdata", language, EngineMode.Default))
using (var page = engine.Process(bmp))
// Add banner page if selected
if (!string.IsNullOrEmpty(bannerPdfPath) && File.Exists(bannerPdfPath))
{
return page.GetText();
lock (pdf)
{
using (PdfDocument bannerDoc = PdfSharp.Pdf.IO.PdfReader.Open(bannerPdfPath, PdfSharp.Pdf.IO.PdfDocumentOpenMode.Import))
{
foreach (PdfPage bannerPage in bannerDoc.Pages)
{
pdf.AddPage(bannerPage);
}
}
}
}
});
// Only save if all pages processed successfully
if (success)
{
string outputPdfPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, outputPdfFileName);
pdf.Save(outputPdfPath);
mainForm.Invoke((MethodInvoker)(() => statusLabel.Text = $"PDF saved: {outputPdfPath}"));
}
else
{
mainForm.Invoke((MethodInvoker)(() => statusLabel.Text = "Conversion incomplete, PDF not saved."));
}
}
catch (Exception ex)
{
mainForm.Invoke((MethodInvoker)(() => statusLabel.Text = $"Error during conversion: {ex.Message}"));
}
}
// Overlay OCR text (invisible but searchable)
private void OverlayTextOntoPdfPage(PdfDocument pdf, PdfPage page, string text)
private string PerformOcr(Bitmap bmp, string language)
{
using (XGraphics gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Prepend))
string tessDataPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "tessdata");
if (!Directory.Exists(tessDataPath))
throw new DirectoryNotFoundException($"Tessdata folder not found: {tessDataPath}");
using (var engine = new TesseractEngine(tessDataPath, language, EngineMode.Default))
using (var page = engine.Process(bmp))
{
XFont font = new XFont("Arial", 10); // regular font
gfx.DrawString(text, font, XBrushes.Transparent, new XRect(0, 0, page.Width, page.Height));
return page.GetText();
}
}
/// <summary>
/// Draws <paramref name="text"/> onto <paramref name="page"/> with a transparent
/// brush, anchored at the top-left of a rectangle covering the whole page.
/// </summary>
/// <param name="page">Page that receives the text layer.</param>
/// <param name="text">Text to draw (the OCR result supplied by the caller).</param>
private void OverlayTextOntoPdfPage(PdfPage page, string text)
{
    // The graphics context is opened with the Prepend option
    // (presumably so the text sits beneath the existing page content — confirm).
    using (XGraphics canvas = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Prepend))
    {
        XFont overlayFont = new XFont("Arial", 10);

        // Layout rectangle spans the full page, expressed in points.
        double pageWidth = XUnit.FromPoint(page.Width).Point;
        double pageHeight = XUnit.FromPoint(page.Height).Point;
        XRect pageArea = new XRect(0, 0, pageWidth, pageHeight);

        canvas.DrawString(text, overlayFont, XBrushes.Transparent, pageArea, XStringFormats.TopLeft);
    }
}
}

View File

@@ -0,0 +1 @@
tessedit_create_alto 1

View File

@@ -0,0 +1,7 @@
tessedit_ambigs_training 1
load_freq_dawg 0
load_punc_dawg 0
load_system_dawg 0
load_number_dawg 0
ambigs_debug_level 3
load_fixed_length_dawgs 0

View File

@@ -0,0 +1 @@
tessedit_zero_rejection T

View File

@@ -0,0 +1,5 @@
load_bigram_dawg True
tessedit_enable_bigram_correction True
tessedit_bigram_debug 3
save_raw_choices True
save_alt_choices True

View File

@@ -0,0 +1,12 @@
disable_character_fragments T
file_type .bl
textord_fast_pitch_test T
tessedit_zero_rejection T
tessedit_minimal_rejection F
tessedit_write_rep_codes F
edges_children_fix F
edges_childarea 0.65
edges_boxarea 0.9
tessedit_resegment_from_boxes T
tessedit_train_from_boxes T
textord_no_rejects T

View File

@@ -0,0 +1,13 @@
file_type .bl
#tessedit_use_nn F
textord_fast_pitch_test T
tessedit_zero_rejection T
tessedit_minimal_rejection F
tessedit_write_rep_codes F
edges_children_fix F
edges_childarea 0.65
edges_boxarea 0.9
tessedit_resegment_from_boxes T
tessedit_train_from_boxes T
#textord_repeat_extraction F
textord_no_rejects T

View File

@@ -0,0 +1 @@
tessedit_char_whitelist 0123456789-.

View File

@@ -0,0 +1 @@
tessedit_write_images T

View File

@@ -0,0 +1,2 @@
tessedit_create_hocr 1
hocr_font_info 0

View File

@@ -0,0 +1,2 @@
interactive_display_mode T
tessedit_display_outwords T

View File

@@ -0,0 +1,4 @@
textord_skewsmooth_offset 8
textord_skewsmooth_offset2 8
textord_merge_desc 0.5
textord_no_rejects 1

View File

@@ -0,0 +1,2 @@
tessedit_resegment_from_line_boxes 1
tessedit_make_boxes_from_boxes 1

View File

@@ -0,0 +1 @@
debug_file tesseract.log

View File

@@ -0,0 +1,11 @@
file_type .bl
textord_fast_pitch_test T
tessedit_zero_rejection T
tessedit_minimal_rejection F
tessedit_write_rep_codes F
edges_children_fix F
edges_childarea 0.65
edges_boxarea 0.9
tessedit_train_line_recognizer T
textord_no_rejects T
tessedit_init_config_only T

View File

@@ -0,0 +1 @@
tessedit_create_lstmbox 1

View File

@@ -0,0 +1,4 @@
stopper_debug_level 1
classify_debug_level 1
segsearch_debug_level 1
language_model_debug_level 3

View File

@@ -0,0 +1 @@
tessedit_create_boxfile 1

View File

@@ -0,0 +1,3 @@
tessedit_create_page_xml 1
# page_xml_polygon 1
# page_xml_level 0

View File

@@ -0,0 +1 @@
tessedit_create_pdf 1

View File

@@ -0,0 +1 @@
debug_file /dev/null

View File

@@ -0,0 +1,2 @@
tessedit_resegment_from_boxes 1
tessedit_make_boxes_from_boxes 1

View File

@@ -0,0 +1,12 @@
textord_show_blobs 0
textord_debug_tabfind 3
textord_tabfind_show_partitions 1
textord_tabfind_show_initial_partitions 1
textord_tabfind_show_columns 1
textord_tabfind_show_blocks 1
textord_tabfind_show_initialtabs 1
textord_tabfind_show_finaltabs 1
textord_tabfind_show_strokewidths 1
textord_tabfind_show_vlines 0
textord_tabfind_show_images 1
tessedit_dump_pageseg_images 0

View File

@@ -0,0 +1 @@
tessedit_create_tsv 1

View File

@@ -0,0 +1,3 @@
# This config file should be used with other config files which create renderers.
# usage example: tesseract eurotext.tif eurotext txt hocr pdf
tessedit_create_txt 1

View File

@@ -0,0 +1,2 @@
tessedit_write_unlv 1
unlv_tilde_crunching T

View File

@@ -0,0 +1 @@
tessedit_create_wordstrbox 1

Some files were not shown because too many files have changed in this diff Show More