From bb833561aa74f02970aee13cdc75973b29716491 Mon Sep 17 00:00:00 2001 From: leshe4ka46 Date: Mon, 27 Oct 2025 20:36:28 +0300 Subject: # This is a combination of 2 commits. # This is the 1st commit message: unmarshal all formats, merge them in the single table, users are truly unique # This is the commit message #2: i --- cmd/pdf/pdf.go | 126 ++++++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 111 insertions(+), 15 deletions(-) (limited to 'cmd/pdf') diff --git a/cmd/pdf/pdf.go b/cmd/pdf/pdf.go index cb6aeb7..4a185a1 100644 --- a/cmd/pdf/pdf.go +++ b/cmd/pdf/pdf.go @@ -1,55 +1,151 @@ +/* + * Extract vector lines and other paths for each page of a PDF file. + * + * Run as: go run pdf_extract_lines.go input.pdf + */ + package main import ( "fmt" - "log" "os" + "github.com/unidoc/unipdf/v4/creator" "github.com/unidoc/unipdf/v4/extractor" "github.com/unidoc/unipdf/v4/model" ) func main() { - f, err := os.Open("../../test.pdf") + // err := reconstruct("../../test.pdf") + // if err != nil { + // fmt.Printf("Error: %v\n", err) + // os.Exit(1) + // } + + err := outputPdfLines("reconstr_words.pdf") if err != nil { - log.Fatalf("Failed to open PDF: %v\n", err) + fmt.Printf("Error: %v\n", err) + os.Exit(1) } + + +} + +// outputPdfLines prints out lines of PDF file to stdout. +func outputPdfLines(inputPath string) error { + f, err := os.Open(inputPath) + if err != nil { + return err + } + defer f.Close() + pdfReader, err := model.NewPdfReader(f) if err != nil { - log.Fatalf("Failed to read PDF: %v\n", err) + return err } + numPages, err := pdfReader.GetNumPages() if err != nil { - log.Fatalf("Failed to retrieve the number of pages: %v\n", err) + return err } - fmt.Printf("Total number of pages: %d\n", numPages) + + // Iterate through pages. fmt.Printf("--------------------\n") - fmt.Printf("PDF to text extraction:\n") + fmt.Printf("PDF lines extraction:\n") fmt.Printf("--------------------\n") for i := 0; i < numPages; i++ { pageNum := i + 1 page, err := pdfReader.GetPage(pageNum) if err != nil { - panic(err) + return err } ex, err := extractor.New(page) if err != nil { - panic(err) + return err } - text, err := ex.ExtractText() + fmt.Println("------------------------------") + fmt.Printf("Page %d:\n", pageNum) + + // Extract stroke paths from the current page. + paths, err := ex.ExtractStrokePaths() if err != nil { - panic(err) + return err } - fmt.Println("------------------------------") - fmt.Printf("Page %d:\n", pageNum) - fmt.Printf("\"%s\"\n", text) - fmt.Println("------------------------------") + // Print debugging info. + for i, path := range paths { + fmt.Printf("Path %d:\n", i) + for j, point := range path.Points { + fmt.Printf("Point %d: %f %f \n", j, point.X, point.Y) + } + } + } + + return nil +} + +func reconstruct(pdfPath string) error { + f, err := os.Open(pdfPath) + if err != nil { + return err + } + defer f.Close() + + pdfr, err := model.NewPdfReaderLazy(f) + if err != nil { + return err + } + + c := creator.New() + + for pageNum := 1; pageNum <= len(pdfr.PageList); pageNum++ { + page, err := pdfr.GetPage(pageNum) + if err != nil { + return err + } + + extr, err := extractor.New(page) + if err != nil { + return err + } + pageText, _, _, err := extr.ExtractPageText() + if err != nil { + return err + } + + // Start on a new page. + c.NewPage() + fmt.Printf("Page %d\n", pageNum) + + text := pageText.Text() + textmarks := pageText.Marks() + fmt.Printf("%s\n", text) + + // Reconstruct the text, each single TextMark drawn at a time with creator.Paragraph. + for _, tm := range textmarks.Elements() { + if tm.Font == nil { + continue + } + fmt.Printf("%s\n", tm.Text) + // Reconstruct by drawing each glyph from textmarks with the creator package. + para := c.NewStyledParagraph() + para.SetText(tm.Original) + para.SetFont(tm.Font) + para.SetFontSize(tm.FontSize) + r, g, b, _ := tm.StrokeColor.RGBA() + rf, gf, bf := float64(r)/0xffff, float64(g)/0xffff, float64(b)/0xffff + para.SetFontColor(creator.ColorRGBFromArithmetic(rf, gf, bf)) + // Convert to PDF coordinate system. + yPos := c.Context().PageHeight - (tm.BBox.Lly + tm.BBox.Height()) + para.SetPos(tm.BBox.Llx, yPos) // Upper left corner. + c.Draw(para) + } } + return c.WriteToFile("reconstr_words.pdf") } -- cgit v1.2.3