aboutsummaryrefslogtreecommitdiff
path: root/cmd/pdf
diff options
context:
space:
mode:
authorleshe4ka46 <alex9102naid1@ya.ru>2025-10-27 20:36:28 +0300
committerleshe4ka46 <alex9102naid1@ya.ru>2025-10-28 13:42:21 +0300
commitbb833561aa74f02970aee13cdc75973b29716491 (patch)
tree0914668e11dbf825979f7419ce1bc78294cd3f7f /cmd/pdf
parente17a425dfb3382310fb5863f516dacdca9f44956 (diff)
# This is a combination of 2 commits.
# This is the 1st commit message: unmarshal all formats, merge them into a single table, users are truly unique # This is the commit message #2: i
Diffstat (limited to 'cmd/pdf')
-rw-r--r--cmd/pdf/pdf.go126
1 files changed, 111 insertions, 15 deletions
diff --git a/cmd/pdf/pdf.go b/cmd/pdf/pdf.go
index cb6aeb7..4a185a1 100644
--- a/cmd/pdf/pdf.go
+++ b/cmd/pdf/pdf.go
@@ -1,55 +1,151 @@
+/*
+ * Extract vector lines and other paths for each page of a PDF file.
+ *
+ * Run as: go run pdf.go
+ * (No arguments are parsed; main passes the hard-coded path "reconstr_words.pdf".)
+ */
+
package main
import (
"fmt"
- "log"
"os"
+ "github.com/unidoc/unipdf/v4/creator"
"github.com/unidoc/unipdf/v4/extractor"
"github.com/unidoc/unipdf/v4/model"
)
func main() {
- f, err := os.Open("../../test.pdf")
+ // err := reconstruct("../../test.pdf")
+ // if err != nil {
+ // fmt.Printf("Error: %v\n", err)
+ // os.Exit(1)
+ // }
+
+ err := outputPdfLines("reconstr_words.pdf")
if err != nil {
- log.Fatalf("Failed to open PDF: %v\n", err)
+ fmt.Printf("Error: %v\n", err)
+ os.Exit(1)
}
+
+
+}
+
+// outputPdfLines prints to stdout the points of every stroke path found on each page of the PDF at inputPath.
+func outputPdfLines(inputPath string) error {
+ f, err := os.Open(inputPath)
+ if err != nil {
+ return err
+ }
+
defer f.Close()
+
pdfReader, err := model.NewPdfReader(f)
if err != nil {
- log.Fatalf("Failed to read PDF: %v\n", err)
+ return err
}
+
numPages, err := pdfReader.GetNumPages()
if err != nil {
- log.Fatalf("Failed to retrieve the number of pages: %v\n", err)
+ return err
}
- fmt.Printf("Total number of pages: %d\n", numPages)
+
+ // Iterate through pages.
fmt.Printf("--------------------\n")
- fmt.Printf("PDF to text extraction:\n")
+ fmt.Printf("PDF lines extraction:\n")
fmt.Printf("--------------------\n")
for i := 0; i < numPages; i++ {
pageNum := i + 1
page, err := pdfReader.GetPage(pageNum)
if err != nil {
- panic(err)
+ return err
}
ex, err := extractor.New(page)
if err != nil {
- panic(err)
+ return err
}
- text, err := ex.ExtractText()
+ fmt.Println("------------------------------")
+ fmt.Printf("Page %d:\n", pageNum)
+
+ // Extract stroke paths from the current page.
+ paths, err := ex.ExtractStrokePaths()
if err != nil {
- panic(err)
+ return err
}
- fmt.Println("------------------------------")
- fmt.Printf("Page %d:\n", pageNum)
- fmt.Printf("\"%s\"\n", text)
- fmt.Println("------------------------------")
+ // Print the index and X/Y coordinates of every point in each stroke path.
+ for i, path := range paths {
+ fmt.Printf("Path %d:\n", i)
+ for j, point := range path.Points {
+ fmt.Printf("Point %d: %f %f \n", j, point.X, point.Y)
+ }
+ }
+ }
+
+ return nil
+}
+
+func reconstruct(pdfPath string) error {
+ f, err := os.Open(pdfPath)
+ if err != nil {
+ return err
+ }
+ defer f.Close()
+
+ pdfr, err := model.NewPdfReaderLazy(f)
+ if err != nil {
+ return err
+ }
+
+ c := creator.New()
+
+ for pageNum := 1; pageNum <= len(pdfr.PageList); pageNum++ {
+ page, err := pdfr.GetPage(pageNum)
+ if err != nil {
+ return err
+ }
+
+ extr, err := extractor.New(page)
+ if err != nil {
+ return err
+ }
+ pageText, _, _, err := extr.ExtractPageText()
+ if err != nil {
+ return err
+ }
+
+ // Start on a new page.
+ c.NewPage()
+ fmt.Printf("Page %d\n", pageNum)
+
+ text := pageText.Text()
+ textmarks := pageText.Marks()
+ fmt.Printf("%s\n", text)
+
+ // Reconstruct the text by drawing one TextMark at a time as a styled paragraph.
+ for _, tm := range textmarks.Elements() {
+ if tm.Font == nil {
+ continue
+ }
+ fmt.Printf("%s\n", tm.Text)
+ // Reconstruct by drawing each glyph from textmarks with the creator package.
+ para := c.NewStyledParagraph()
+ para.SetText(tm.Original)
+ para.SetFont(tm.Font)
+ para.SetFontSize(tm.FontSize)
+ r, g, b, _ := tm.StrokeColor.RGBA()
+ rf, gf, bf := float64(r)/0xffff, float64(g)/0xffff, float64(b)/0xffff
+ para.SetFontColor(creator.ColorRGBFromArithmetic(rf, gf, bf))
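+ // Flip the Y axis: subtract the box's upper edge (Lly + height) from the page height so the position is measured down from the top of the page.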
+ yPos := c.Context().PageHeight - (tm.BBox.Lly + tm.BBox.Height())
+ para.SetPos(tm.BBox.Llx, yPos) // Upper left corner.
+ c.Draw(para)
+ }
}
+ return c.WriteToFile("reconstr_words.pdf")
}