using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using UglyToad.PdfPig;
using UglyToad.PdfPig.Content;
using OfficeOpenXml;
/// <summary>
/// Console tool that reads the words of a PDF, arranges them into rows and
/// columns based on their position on the page, and prints the result.
/// </summary>
class Program
{
    /// <summary>
    /// Fraction of a word's height by which another word's bottom edge may
    /// differ while still being treated as part of the same row (heuristic).
    /// </summary>
    private const double RowToleranceFactor = 0.5;

    /// <summary>
    /// Lower bound for the row tolerance, in page units, so very short words
    /// do not produce a tolerance close to zero (heuristic).
    /// </summary>
    private const double MinRowTolerance = 2.0;

    /// <summary>
    /// Entry point. Uses the first command-line argument as the PDF path when
    /// one is given; otherwise falls back to "input.pdf".
    /// </summary>
    /// <param name="args">Optional: args[0] is the path of the PDF to read.</param>
    static void Main(string[] args)
    {
        string pdfPath = args.Length > 0 ? args[0] : "input.pdf";
        string excelPath = "output.xlsx";

        // Fail with a readable message instead of an unhandled exception.
        if (!File.Exists(pdfPath))
        {
            Console.Error.WriteLine("找不到 PDF 文件：" + pdfPath);
            Environment.ExitCode = 1;
            return;
        }

        // Extract the table data from the PDF.
        var tableData = ExtractTableFromPdf(pdfPath);

        // Dump the extracted rows to the console.
        SaveTableToText(tableData, excelPath);

        // No Excel file is produced by this program, so the message reports
        // what actually happened rather than claiming a conversion.
        Console.WriteLine("PDF 表格已成功提取并输出到控制台。");
    }

    /// <summary>
    /// Reads every page of the PDF and groups its words into rows (top of the
    /// page first) with the words of each row ordered left to right.
    /// </summary>
    /// <param name="pdfPath">Path of the PDF document to open.</param>
    /// <returns>One list of word texts per detected row, pages in document order.</returns>
    static List<List<string>> ExtractTableFromPdf(string pdfPath)
    {
        var tableData = new List<List<string>>();

        using (var pdfDocument = PdfDocument.Open(pdfPath))
        {
            foreach (var page in pdfDocument.GetPages())
            {
                // Highest bottom edge first, so rows come out top to bottom.
                var wordsTopDown = page.GetWords()
                    .OrderByDescending(w => (double)w.BoundingBox.Bottom)
                    .ToList();

                List<Word> currentRow = null;
                double anchorBottom = 0.0;
                double tolerance = 0.0;

                foreach (var word in wordsTopDown)
                {
                    double bottom = (double)word.BoundingBox.Bottom;

                    // Grouping on exact coordinate equality splits a visual
                    // line whenever bottoms differ by a fraction of a unit, so
                    // compare against the row's first word with a tolerance.
                    if (currentRow == null || anchorBottom - bottom > tolerance)
                    {
                        if (currentRow != null)
                        {
                            tableData.Add(ToColumns(currentRow));
                        }

                        currentRow = new List<Word>();
                        anchorBottom = bottom;
                        tolerance = Math.Max(
                            MinRowTolerance,
                            Math.Abs((double)word.BoundingBox.Height) * RowToleranceFactor);
                    }

                    currentRow.Add(word);
                }

                // Flush the last row of the page.
                if (currentRow != null)
                {
                    tableData.Add(ToColumns(currentRow));
                }
            }
        }

        return tableData;
    }

    /// <summary>Orders the words of one row left to right and returns their texts.</summary>
    private static List<string> ToColumns(List<Word> rowWords)
    {
        return rowWords
            .OrderBy(w => (double)w.BoundingBox.Left)
            .Select(w => w.Text)
            .ToList();
    }

    /// <summary>
    /// Writes each row to the console as "|cell1|cell2|...", preceded by a
    /// separator line.
    /// </summary>
    /// <param name="tableData">Rows of cell texts to print.</param>
    /// <param name="excelPath">
    /// Currently unused: nothing is written to this path. Kept so the
    /// signature stays compatible with existing callers.
    /// </param>
    static void SaveTableToText(List<List<string>> tableData, string excelPath)
    {
        foreach (var row in tableData)
        {
            // Every cell is prefixed with '|'; an empty row prints an empty line.
            string line = string.Concat(row.Select(cell => "|" + cell));

            Console.WriteLine("----------------------");
            Console.WriteLine(line);
        }
    }
}