update
This commit is contained in:
Binary file not shown.
@@ -0,0 +1,10 @@
|
||||
{
|
||||
"version": "0.2.1",
|
||||
"tasks": [
|
||||
{
|
||||
"taskLabel": "task-sRGB",
|
||||
"appliesTo": "PDFWorkflowManager/PDFWorkflowManager/sRGB.icc",
|
||||
"type": "launch"
|
||||
}
|
||||
]
|
||||
}
|
||||
@@ -0,0 +1,18 @@
|
||||
using PdfSharp.Fonts;
|
||||
using System.IO;
|
||||
|
||||
/// <summary>
/// PDFsharp font resolver that serves every requested font family from the
/// Arial files found in the operating system's fonts folder.
/// </summary>
/// <remarks>
/// Bold and italic requests are mapped to the matching Arial style file when
/// that file exists on disk. When it does not, the regular file is used and
/// PDFsharp is asked to simulate the missing style, so the requested style is
/// never silently dropped.
/// </remarks>
public class SystemFontResolver : IFontResolver
{
    // Face-name keys returned by ResolveTypeface and later passed back to GetFont.
    private const string RegularFace = "Arial#";
    private const string BoldFace = "Arial#b";
    private const string ItalicFace = "Arial#i";
    private const string BoldItalicFace = "Arial#bi";

    /// <summary>
    /// Maps any requested family to an Arial face that matches the requested style.
    /// </summary>
    /// <param name="familyName">Requested family name; ignored, every family resolves to Arial.</param>
    /// <param name="isBold">Whether a bold face was requested.</param>
    /// <param name="isItalic">Whether an italic face was requested.</param>
    /// <returns>
    /// Resolver info carrying the face-name key for <see cref="GetFont"/>; for a
    /// missing style file it carries the regular key plus simulation flags.
    /// </returns>
    public FontResolverInfo ResolveTypeface(string familyName, bool isBold, bool isItalic)
    {
        string faceName;
        if (isBold)
        {
            faceName = isItalic ? BoldItalicFace : BoldFace;
        }
        else
        {
            faceName = isItalic ? ItalicFace : RegularFace;
        }

        // Regular requests keep the original "Arial#" key unconditionally.
        if (faceName == RegularFace || File.Exists(GetFontPath(faceName)))
        {
            return new FontResolverInfo(faceName);
        }

        // Style file not installed: use regular Arial and let PDFsharp simulate the style.
        return new FontResolverInfo(RegularFace, isBold, isItalic);
    }

    /// <summary>
    /// Reads the font file that belongs to the given face-name key.
    /// </summary>
    /// <param name="faceName">Key produced by <see cref="ResolveTypeface"/>; unknown keys map to regular Arial.</param>
    /// <returns>The raw bytes of the font file.</returns>
    /// <exception cref="FileNotFoundException">The font file does not exist in the fonts folder.</exception>
    public byte[] GetFont(string faceName)
    {
        return File.ReadAllBytes(GetFontPath(faceName));
    }

    // Translates a face-name key into the full path of the Arial file in the system fonts folder.
    private static string GetFontPath(string faceName)
    {
        string fileName;
        switch (faceName)
        {
            case BoldFace:
                fileName = "arialbd.ttf";
                break;
            case ItalicFace:
                fileName = "ariali.ttf";
                break;
            case BoldItalicFace:
                fileName = "arialbi.ttf";
                break;
            default:
                fileName = "arial.ttf";
                break;
        }

        string fontsFolder = System.Environment.GetFolderPath(System.Environment.SpecialFolder.Fonts);
        return Path.Combine(fontsFolder, fileName);
    }
}
|
||||
@@ -1,146 +1,173 @@
|
||||
using System;
|
||||
using System.IO;
|
||||
using System.Drawing;
|
||||
using BitMiracle.LibTiff.Classic;
|
||||
using System.Threading.Tasks;
|
||||
using System.Windows.Forms;
|
||||
using PdfSharp.Pdf;
|
||||
using PdfSharp.Drawing;
|
||||
using Tesseract;
|
||||
using System.Windows.Forms;
|
||||
using System.Linq;
|
||||
using ImageMagick;
|
||||
|
||||
namespace RCEU_PDFWorkflowManager
|
||||
{
|
||||
public class TiffToPdfConverter
|
||||
{
|
||||
public void ConvertTiffToPdfAWithOcr(
|
||||
/// <summary>
|
||||
/// Maximum number of threads to use for page processing
|
||||
/// </summary>
|
||||
public int MaxDegreeOfParallelism { get; set; } = Environment.ProcessorCount;
|
||||
|
||||
/// <summary>
|
||||
/// Converts all TIFF files in a directory to a searchable PDF.
|
||||
/// </summary>
|
||||
public async Task ConvertTiffToPdfWithOcrAsync(
|
||||
string workOutDir,
|
||||
string outputPdfPath,
|
||||
string ocrLanguage,
|
||||
string outputPdfFileName, // e.g., "output.pdf"
|
||||
string selectedLanguage,
|
||||
ToolStripProgressBar progressBar,
|
||||
ToolStripStatusLabel statusLabel)
|
||||
ToolStripStatusLabel statusLabel,
|
||||
Form mainForm,
|
||||
string bannerPdfPath = null) // optional banner PDF
|
||||
{
|
||||
string[] tiffFiles = Directory.GetFiles(workOutDir, "*.tif");
|
||||
|
||||
if (tiffFiles.Length == 0)
|
||||
throw new FileNotFoundException("No TIFF files found in the directory.");
|
||||
|
||||
// Count total pages for progress bar
|
||||
int totalPages = tiffFiles.Sum(file => Tiff.Open(file, "r")?.NumberOfDirectories() ?? 0);
|
||||
progressBar.Maximum = totalPages;
|
||||
progressBar.Value = 0;
|
||||
statusLabel.Text = "Converting TIFFs to PDF...";
|
||||
int totalPages = 0;
|
||||
foreach (var file in tiffFiles)
|
||||
using (var collection = new MagickImageCollection(file))
|
||||
totalPages += collection.Count;
|
||||
|
||||
PdfDocument pdf = new PdfDocument();
|
||||
pdf.Info.Title = "Converted TIFF to PDF/A";
|
||||
pdf.Info.Creator = "RCEU_PDFWorkflowManager";
|
||||
|
||||
foreach (var tiffFile in tiffFiles)
|
||||
mainForm.Invoke((MethodInvoker)(() =>
|
||||
{
|
||||
using (Tiff image = Tiff.Open(tiffFile, "r"))
|
||||
progressBar.Maximum = totalPages;
|
||||
progressBar.Value = 0;
|
||||
statusLabel.Text = "Converting TIFFs to PDF...";
|
||||
}));
|
||||
|
||||
bool success = true;
|
||||
|
||||
try
|
||||
{
|
||||
await Task.Run(() =>
|
||||
{
|
||||
if (image == null) continue;
|
||||
|
||||
int pageCount = image.NumberOfDirectories();
|
||||
for (int pageIndex = 0; pageIndex < pageCount; pageIndex++)
|
||||
foreach (var tiffFile in tiffFiles)
|
||||
{
|
||||
image.SetDirectory((short)pageIndex);
|
||||
|
||||
int width = image.GetField(TiffTag.IMAGEWIDTH)[0].ToInt();
|
||||
int height = image.GetField(TiffTag.IMAGELENGTH)[0].ToInt();
|
||||
|
||||
int[] raster = new int[width * height];
|
||||
image.ReadRGBAImage(width, height, raster);
|
||||
|
||||
using (var bmp = new Bitmap(width, height, System.Drawing.Imaging.PixelFormat.Format32bppArgb))
|
||||
using (var collection = new MagickImageCollection(tiffFile))
|
||||
{
|
||||
var bmpData = bmp.LockBits(
|
||||
new Rectangle(0, 0, width, height),
|
||||
System.Drawing.Imaging.ImageLockMode.WriteOnly,
|
||||
bmp.PixelFormat);
|
||||
|
||||
System.Runtime.InteropServices.Marshal.Copy(raster, 0, bmpData.Scan0, raster.Length);
|
||||
bmp.UnlockBits(bmpData);
|
||||
|
||||
PdfPage pagePdf = pdf.AddPage();
|
||||
pagePdf.Width = XUnit.FromPoint(width);
|
||||
pagePdf.Height = XUnit.FromPoint(height);
|
||||
|
||||
using (XGraphics gfx = XGraphics.FromPdfPage(pagePdf))
|
||||
Parallel.ForEach(collection, new ParallelOptions { MaxDegreeOfParallelism = MaxDegreeOfParallelism }, magickImage =>
|
||||
{
|
||||
// Save temp PNG
|
||||
string tempPath = Path.Combine(Path.GetTempPath(), Guid.NewGuid() + ".png");
|
||||
bmp.Save(tempPath, System.Drawing.Imaging.ImageFormat.Png);
|
||||
|
||||
using (XImage ximg = XImage.FromFile(tempPath))
|
||||
using (Bitmap bmp = magickImage.ToBitmap())
|
||||
{
|
||||
gfx.DrawImage(ximg, 0, 0, width, height);
|
||||
string extractedText = "";
|
||||
try
|
||||
{
|
||||
extractedText = PerformOcr(bmp, selectedLanguage);
|
||||
}
|
||||
catch
|
||||
{
|
||||
// If OCR fails, continue but mark success false
|
||||
success = false;
|
||||
}
|
||||
|
||||
lock (pdf)
|
||||
{
|
||||
PdfPage page = pdf.AddPage();
|
||||
page.Width = XUnit.FromPoint(bmp.Width).Point;
|
||||
page.Height = XUnit.FromPoint(bmp.Height).Point;
|
||||
|
||||
using (var gfx = XGraphics.FromPdfPage(page))
|
||||
using (var ms = new MemoryStream())
|
||||
{
|
||||
bmp.Save(ms, System.Drawing.Imaging.ImageFormat.Png);
|
||||
ms.Position = 0;
|
||||
using (var ximg = XImage.FromStream(ms))
|
||||
gfx.DrawImage(ximg,
|
||||
XUnit.FromPoint(0),
|
||||
XUnit.FromPoint(0),
|
||||
XUnit.FromPoint(bmp.Width),
|
||||
XUnit.FromPoint(bmp.Height));
|
||||
}
|
||||
|
||||
OverlayTextOntoPdfPage(page, extractedText);
|
||||
}
|
||||
|
||||
mainForm.Invoke((MethodInvoker)(() =>
|
||||
{
|
||||
progressBar.Value++;
|
||||
statusLabel.Text = $"Processing page {progressBar.Value} of {progressBar.Maximum}";
|
||||
}));
|
||||
}
|
||||
|
||||
File.Delete(tempPath);
|
||||
}
|
||||
|
||||
// OCR and overlay text
|
||||
string extractedText = PerformOcr(tiffFile, pageIndex, ocrLanguage);
|
||||
OverlayTextOntoPdfPage(pdf, pagePdf, extractedText);
|
||||
|
||||
// Update progress bar safely
|
||||
if (progressBar.InvokeRequired)
|
||||
{
|
||||
progressBar.Invoke((MethodInvoker)delegate
|
||||
{
|
||||
progressBar.Value++;
|
||||
});
|
||||
}
|
||||
else
|
||||
{
|
||||
progressBar.Value++;
|
||||
}
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
pdf.Save(outputPdfPath);
|
||||
statusLabel.Text = "Conversion complete!";
|
||||
}
|
||||
|
||||
// OCR method
|
||||
private string PerformOcr(string tiffFile, int pageIndex, string language)
|
||||
{
|
||||
using (Tiff tiff = Tiff.Open(tiffFile, "r"))
|
||||
{
|
||||
tiff.SetDirectory((short)pageIndex);
|
||||
int width = tiff.GetField(TiffTag.IMAGEWIDTH)[0].ToInt();
|
||||
int height = tiff.GetField(TiffTag.IMAGELENGTH)[0].ToInt();
|
||||
|
||||
int[] raster = new int[width * height];
|
||||
tiff.ReadRGBAImage(width, height, raster);
|
||||
|
||||
using (Bitmap bmp = new Bitmap(width, height, System.Drawing.Imaging.PixelFormat.Format32bppArgb))
|
||||
{
|
||||
for (int y = 0; y < height; y++)
|
||||
for (int x = 0; x < width; x++)
|
||||
{
|
||||
int rgba = raster[y * width + x];
|
||||
int r = rgba & 0xFF;
|
||||
int g = (rgba >> 8) & 0xFF;
|
||||
int b = (rgba >> 16) & 0xFF;
|
||||
bmp.SetPixel(x, height - y - 1, Color.FromArgb(255, r, g, b));
|
||||
}
|
||||
|
||||
using (var engine = new TesseractEngine(@"./tessdata", language, EngineMode.Default))
|
||||
using (var page = engine.Process(bmp))
|
||||
// Add banner page if selected
|
||||
if (!string.IsNullOrEmpty(bannerPdfPath) && File.Exists(bannerPdfPath))
|
||||
{
|
||||
return page.GetText();
|
||||
lock (pdf)
|
||||
{
|
||||
using (PdfDocument bannerDoc = PdfSharp.Pdf.IO.PdfReader.Open(bannerPdfPath, PdfSharp.Pdf.IO.PdfDocumentOpenMode.Import))
|
||||
{
|
||||
foreach (PdfPage bannerPage in bannerDoc.Pages)
|
||||
{
|
||||
pdf.AddPage(bannerPage);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
});
|
||||
|
||||
// Only save if all pages processed successfully
|
||||
if (success)
|
||||
{
|
||||
string outputPdfPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, outputPdfFileName);
|
||||
pdf.Save(outputPdfPath);
|
||||
mainForm.Invoke((MethodInvoker)(() => statusLabel.Text = $"PDF saved: {outputPdfPath}"));
|
||||
}
|
||||
else
|
||||
{
|
||||
mainForm.Invoke((MethodInvoker)(() => statusLabel.Text = "Conversion incomplete, PDF not saved."));
|
||||
}
|
||||
}
|
||||
catch (Exception ex)
|
||||
{
|
||||
mainForm.Invoke((MethodInvoker)(() => statusLabel.Text = $"Error during conversion: {ex.Message}"));
|
||||
}
|
||||
}
|
||||
|
||||
// Overlay OCR text (invisible but searchable)
|
||||
private void OverlayTextOntoPdfPage(PdfDocument pdf, PdfPage page, string text)
|
||||
private string PerformOcr(Bitmap bmp, string language)
|
||||
{
|
||||
using (XGraphics gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Prepend))
|
||||
string tessDataPath = Path.Combine(AppDomain.CurrentDomain.BaseDirectory, "tessdata");
|
||||
|
||||
if (!Directory.Exists(tessDataPath))
|
||||
throw new DirectoryNotFoundException($"Tessdata folder not found: {tessDataPath}");
|
||||
|
||||
using (var engine = new TesseractEngine(tessDataPath, language, EngineMode.Default))
|
||||
using (var page = engine.Process(bmp))
|
||||
{
|
||||
XFont font = new XFont("Arial", 10); // regular font
|
||||
gfx.DrawString(text, font, XBrushes.Transparent, new XRect(0, 0, page.Width, page.Height));
|
||||
return page.GetText();
|
||||
}
|
||||
}
|
||||
|
||||
/// <summary>
/// Writes the OCR text onto the page with a transparent brush so the page
/// carries a text layer without changing its visible appearance.
/// </summary>
/// <param name="page">PDF page that receives the text layer.</param>
/// <param name="text">Text recognised for the page; null or empty text is skipped.</param>
private void OverlayTextOntoPdfPage(PdfPage page, string text)
{
    // Nothing to overlay when OCR produced no text; this also keeps a null
    // string away from DrawString, which does not accept one.
    if (string.IsNullOrEmpty(text))
    {
        return;
    }

    // Prepend places the text in front of the page's existing content stream.
    using (var gfx = XGraphics.FromPdfPage(page, XGraphicsPdfPageOptions.Prepend))
    {
        var font = new XFont("Arial", 10);

        // Layout rectangle covers the whole page, expressed in points.
        var pageArea = new XRect(0, 0, page.Width.Point, page.Height.Point);

        gfx.DrawString(text, font, XBrushes.Transparent, pageArea, XStringFormats.TopLeft);
    }
}
|
||||
}
|
||||
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1 @@
|
||||
tessedit_create_alto 1
|
||||
@@ -0,0 +1,7 @@
|
||||
tessedit_ambigs_training 1
|
||||
load_freq_dawg 0
|
||||
load_punc_dawg 0
|
||||
load_system_dawg 0
|
||||
load_number_dawg 0
|
||||
ambigs_debug_level 3
|
||||
load_fixed_length_dawgs 0
|
||||
@@ -0,0 +1 @@
|
||||
tessedit_zero_rejection T
|
||||
@@ -0,0 +1,5 @@
|
||||
load_bigram_dawg True
|
||||
tessedit_enable_bigram_correction True
|
||||
tessedit_bigram_debug 3
|
||||
save_raw_choices True
|
||||
save_alt_choices True
|
||||
@@ -0,0 +1,12 @@
|
||||
disable_character_fragments T
|
||||
file_type .bl
|
||||
textord_fast_pitch_test T
|
||||
tessedit_zero_rejection T
|
||||
tessedit_minimal_rejection F
|
||||
tessedit_write_rep_codes F
|
||||
edges_children_fix F
|
||||
edges_childarea 0.65
|
||||
edges_boxarea 0.9
|
||||
tessedit_resegment_from_boxes T
|
||||
tessedit_train_from_boxes T
|
||||
textord_no_rejects T
|
||||
@@ -0,0 +1,13 @@
|
||||
file_type .bl
|
||||
#tessedit_use_nn F
|
||||
textord_fast_pitch_test T
|
||||
tessedit_zero_rejection T
|
||||
tessedit_minimal_rejection F
|
||||
tessedit_write_rep_codes F
|
||||
edges_children_fix F
|
||||
edges_childarea 0.65
|
||||
edges_boxarea 0.9
|
||||
tessedit_resegment_from_boxes T
|
||||
tessedit_train_from_boxes T
|
||||
#textord_repeat_extraction F
|
||||
textord_no_rejects T
|
||||
@@ -0,0 +1 @@
|
||||
tessedit_char_whitelist 0123456789-.
|
||||
@@ -0,0 +1 @@
|
||||
tessedit_write_images T
|
||||
@@ -0,0 +1,2 @@
|
||||
tessedit_create_hocr 1
|
||||
hocr_font_info 0
|
||||
@@ -0,0 +1,2 @@
|
||||
interactive_display_mode T
|
||||
tessedit_display_outwords T
|
||||
@@ -0,0 +1,4 @@
|
||||
textord_skewsmooth_offset 8
|
||||
textord_skewsmooth_offset2 8
|
||||
textord_merge_desc 0.5
|
||||
textord_no_rejects 1
|
||||
@@ -0,0 +1,2 @@
|
||||
tessedit_resegment_from_line_boxes 1
|
||||
tessedit_make_boxes_from_boxes 1
|
||||
@@ -0,0 +1 @@
|
||||
debug_file tesseract.log
|
||||
@@ -0,0 +1,11 @@
|
||||
file_type .bl
|
||||
textord_fast_pitch_test T
|
||||
tessedit_zero_rejection T
|
||||
tessedit_minimal_rejection F
|
||||
tessedit_write_rep_codes F
|
||||
edges_children_fix F
|
||||
edges_childarea 0.65
|
||||
edges_boxarea 0.9
|
||||
tessedit_train_line_recognizer T
|
||||
textord_no_rejects T
|
||||
tessedit_init_config_only T
|
||||
@@ -0,0 +1 @@
|
||||
tessedit_create_lstmbox 1
|
||||
@@ -0,0 +1,4 @@
|
||||
stopper_debug_level 1
|
||||
classify_debug_level 1
|
||||
segsearch_debug_level 1
|
||||
language_model_debug_level 3
|
||||
@@ -0,0 +1 @@
|
||||
tessedit_create_boxfile 1
|
||||
@@ -0,0 +1,3 @@
|
||||
tessedit_create_page_xml 1
|
||||
# page_xml_polygon 1
|
||||
# page_xml_level 0
|
||||
@@ -0,0 +1 @@
|
||||
tessedit_create_pdf 1
|
||||
@@ -0,0 +1 @@
|
||||
debug_file /dev/null
|
||||
@@ -0,0 +1,2 @@
|
||||
tessedit_resegment_from_boxes 1
|
||||
tessedit_make_boxes_from_boxes 1
|
||||
@@ -0,0 +1,12 @@
|
||||
textord_show_blobs 0
|
||||
textord_debug_tabfind 3
|
||||
textord_tabfind_show_partitions 1
|
||||
textord_tabfind_show_initial_partitions 1
|
||||
textord_tabfind_show_columns 1
|
||||
textord_tabfind_show_blocks 1
|
||||
textord_tabfind_show_initialtabs 1
|
||||
textord_tabfind_show_finaltabs 1
|
||||
textord_tabfind_show_strokewidths 1
|
||||
textord_tabfind_show_vlines 0
|
||||
textord_tabfind_show_images 1
|
||||
tessedit_dump_pageseg_images 0
|
||||
@@ -0,0 +1 @@
|
||||
tessedit_create_tsv 1
|
||||
@@ -0,0 +1,3 @@
|
||||
# This config file should be used with other config files which create renderers.
|
||||
# usage example: tesseract eurotext.tif eurotext txt hocr pdf
|
||||
tessedit_create_txt 1
|
||||
@@ -0,0 +1,2 @@
|
||||
tessedit_write_unlv 1
|
||||
unlv_tilde_crunching T
|
||||
@@ -0,0 +1 @@
|
||||
tessedit_create_wordstrbox 1
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user