/* * Extract vector lines and other paths for each page of a PDF file. * * Run as: go run pdf_extract_lines.go input.pdf */ package main import ( "fmt" "os" "github.com/unidoc/unipdf/v4/creator" "github.com/unidoc/unipdf/v4/extractor" "github.com/unidoc/unipdf/v4/model" ) func main() { // err := reconstruct("../../test.pdf") // if err != nil { // fmt.Printf("Error: %v\n", err) // os.Exit(1) // } err := outputPdfLines("reconstr_words.pdf") if err != nil { fmt.Printf("Error: %v\n", err) os.Exit(1) } } // outputPdfLines prints out lines of PDF file to stdout. func outputPdfLines(inputPath string) error { f, err := os.Open(inputPath) if err != nil { return err } defer f.Close() pdfReader, err := model.NewPdfReader(f) if err != nil { return err } numPages, err := pdfReader.GetNumPages() if err != nil { return err } // Iterate through pages. fmt.Printf("--------------------\n") fmt.Printf("PDF lines extraction:\n") fmt.Printf("--------------------\n") for i := 0; i < numPages; i++ { pageNum := i + 1 page, err := pdfReader.GetPage(pageNum) if err != nil { return err } ex, err := extractor.New(page) if err != nil { return err } fmt.Println("------------------------------") fmt.Printf("Page %d:\n", pageNum) // Extract stroke paths from the current page. paths, err := ex.ExtractStrokePaths() if err != nil { return err } // Print debugging info. for i, path := range paths { fmt.Printf("Path %d:\n", i) for j, point := range path.Points { fmt.Printf("Point %d: %f %f \n", j, point.X, point.Y) } } } return nil } func reconstruct(pdfPath string) error { f, err := os.Open(pdfPath) if err != nil { return err } defer f.Close() pdfr, err := model.NewPdfReaderLazy(f) if err != nil { return err } c := creator.New() for pageNum := 1; pageNum <= len(pdfr.PageList); pageNum++ { page, err := pdfr.GetPage(pageNum) if err != nil { return err } extr, err := extractor.New(page) if err != nil { return err } pageText, _, _, err := extr.ExtractPageText() if err != nil { return err } // Start on a new page. c.NewPage() fmt.Printf("Page %d\n", pageNum) text := pageText.Text() textmarks := pageText.Marks() fmt.Printf("%s\n", text) // Reconstruct the text, each single TextMark drawn at a time with creator.Paragraph. for _, tm := range textmarks.Elements() { if tm.Font == nil { continue } fmt.Printf("%s\n", tm.Text) // Reconstruct by drawing each glyph from textmarks with the creator package. para := c.NewStyledParagraph() para.SetText(tm.Original) para.SetFont(tm.Font) para.SetFontSize(tm.FontSize) r, g, b, _ := tm.StrokeColor.RGBA() rf, gf, bf := float64(r)/0xffff, float64(g)/0xffff, float64(b)/0xffff para.SetFontColor(creator.ColorRGBFromArithmetic(rf, gf, bf)) // Convert to PDF coordinate system. yPos := c.Context().PageHeight - (tm.BBox.Lly + tm.BBox.Height()) para.SetPos(tm.BBox.Llx, yPos) // Upper left corner. c.Draw(para) } } return c.WriteToFile("reconstr_words.pdf") }