From bb833561aa74f02970aee13cdc75973b29716491 Mon Sep 17 00:00:00 2001 From: leshe4ka46 Date: Mon, 27 Oct 2025 20:36:28 +0300 Subject: # This is a combination of 2 commits. # This is the 1st commit message: unmarshal all formats, merge them in the single table, users are truly unique # This is the commit message #2: i --- cmd/airlines/main.go | 100 +++++++++++++++++++++++++++-------- cmd/analytics/analytics.go | 18 +++++++ cmd/csv/main.go | 18 +++++++ cmd/fiotest/fio.go | 15 ------ cmd/pdf/pdf.go | 126 +++++++++++++++++++++++++++++++++++++++------ cmd/store/test.go | 117 +++++++++++++++++++++++++++++++++++++++++ cmd/xlsx/main.go | 88 ++++++++++++++++++++----------- cmd/xml/xml.go | 32 ++++++++++++ cmd/yaml/main.go | 18 +++++++ 9 files changed, 451 insertions(+), 81 deletions(-) create mode 100644 cmd/analytics/analytics.go create mode 100644 cmd/csv/main.go delete mode 100644 cmd/fiotest/fio.go create mode 100644 cmd/store/test.go create mode 100644 cmd/xml/xml.go create mode 100644 cmd/yaml/main.go (limited to 'cmd') diff --git a/cmd/airlines/main.go b/cmd/airlines/main.go index 8eeaef5..1710d57 100644 --- a/cmd/airlines/main.go +++ b/cmd/airlines/main.go @@ -1,13 +1,15 @@ package main import ( - "context" - "encoding/json" "fmt" - "os" - ljson "airlines/pkg/adapters/json" - "airlines/pkg/store" + "airlines/pkg/adapters/csv" + "airlines/pkg/adapters/json" + "airlines/pkg/adapters/xlsx" + "airlines/pkg/adapters/xml" + "airlines/pkg/adapters/yaml" + + "airlines/pkg/localstore" "github.com/joho/godotenv" ) @@ -18,26 +20,80 @@ func main() { if err != nil { fmt.Println(err) } - store, err := store.NewStore(fmt.Sprintf("postgres://%s:%s@%s:%s/%s", os.Getenv("DB_USER"), os.Getenv("DB_PASSWORD"), os.Getenv("DB_HOST"), os.Getenv("DB_PORT"), os.Getenv("DB_NAME"))) - if err != nil { - panic(err) - } - _ = store.AutoMigrate() + //fmt.Sprintf("postgres://%s:%s@%s:%s/%s", os.Getenv("DB_USER"), os.Getenv("DB_PASSWORD"), os.Getenv("DB_HOST"), os.Getenv("DB_PORT"), 
os.Getenv("DB_NAME"))) + // store, err := store.NewStore("user=postgres dbname=airlines host=/home/alex/.pgsocket sslmode=disable") - // i, err := json.ImportForumProfilesJSON(context.Background(), store, "../../full.json", 16384) - // fmt.Println(i, err) - f, err := os.Open("../../full.json") + store := localstore.NewLocalStore() + fmt.Println("store created") - dec := json.NewDecoder(f) - // optional: be strict about unexpected fields - // dec.DisallowUnknownFields() + func() { + root, err := json.UnmarshalJsonRoot("/home/alex/ds-data/FrequentFlyerForum-Profiles.json") + if err != nil { + panic(err) + } + fmt.Println("\nunmarshalled json") - var root ljson.JsonRoot - if err := dec.Decode(&root); err != nil { - panic(err) - } + root.DumpToDb(store) + fmt.Println("\ndumped json") + + }() + + func() { + // xlsx + tickets, err := xlsx.UnmarshallXlsxFiles("/home/alex/ds-data/YourBoardingPassDotAero/") + if err != nil { + panic(err) + } + fmt.Println("\nunmarshalled xlsx") + + xlsx.DumpToDb(store, tickets) + fmt.Println("\ndumped xlsx to db") + }() + + func() { + xmldata, err := xml.UnmarshalXml("/home/alex/ds-data/PointzAggregator-AirlinesData.xml") + if err != nil { + panic(err) + } + fmt.Println("\nunmarshalled xml") + + xmldata.DumpToDb(store) + fmt.Println("\ndumped xml to db") + }() + + func() { + yamlData, err := yaml.UnmarshallYaml("/home/alex/ds-data/SkyTeam-Exchange.yaml") + if err != nil { + panic(err) + } + fmt.Println("\nunmarshalled yaml") + + yamlData.DumpToDb(store) + fmt.Println("\ndumped yaml to db") + }() + + func() { + csvData, err := csv.UnmarshallCsv("/home/alex/ds-data/csv.csv", false) + if err != nil { + panic(err) + } + fmt.Println("\nunmarshalled csv1") + + csvData.DumpToDb(store) + fmt.Println("\ndumped csv1 to db") + }() + + // second CSV source: tab.csv, imported into the same store + func() { + csvData2, err := csv.UnmarshallCsv("/home/alex/ds-data/tab.csv", true) + if err != nil { + panic(err) + } + fmt.Println("\nunmarshalled csv2") - 
root.DumpToDb(context.Background(), store) - // 
fmt.Println(root) + csvData2.DumpToDb(store) + fmt.Println("\ndumped csv2 to db") + }() + fmt.Println(store.ExportAllCSVs("/tmp/ds")) } diff --git a/cmd/analytics/analytics.go b/cmd/analytics/analytics.go new file mode 100644 index 0000000..e2cbdb9 --- /dev/null +++ b/cmd/analytics/analytics.go @@ -0,0 +1,18 @@ +package main + +import ( + "airlines/pkg/localstore" + "fmt" +) + + +func main() { + loc := localstore.NewLocalStore() + + loc.ImportAllCSVs("/tmp/ds") + + fmt.Println(loc.FindCard("FF", 0, "")) + + + loc.Analytics() +} diff --git a/cmd/csv/main.go b/cmd/csv/main.go new file mode 100644 index 0000000..e1e5174 --- /dev/null +++ b/cmd/csv/main.go @@ -0,0 +1,18 @@ +package main + +import ( + "airlines/pkg/adapters/csv" + "airlines/pkg/localstore" +) + +func main() { + csvData, err := csv.UnmarshallCsv("/home/alex/ds-data/tab.csv", true) + if err != nil { + panic(err) + } + + store := localstore.NewLocalStore() + + csvData.DumpToDb(store) + store.ExportAllCSVs("/tmp/ds") +} diff --git a/cmd/fiotest/fio.go b/cmd/fiotest/fio.go deleted file mode 100644 index 8195da7..0000000 --- a/cmd/fiotest/fio.go +++ /dev/null @@ -1,15 +0,0 @@ -package main - -import ( - "airlines/pkg/names" - "fmt" -) - -func main() { - - f, err := names.ParseLatinName("MAKAR A TIKHOMIROV") - if err != nil { - panic(err) - } - fmt.Printf("%+v\n", f) -} diff --git a/cmd/pdf/pdf.go b/cmd/pdf/pdf.go index cb6aeb7..4a185a1 100644 --- a/cmd/pdf/pdf.go +++ b/cmd/pdf/pdf.go @@ -1,55 +1,151 @@ +/* + * Extract vector lines and other paths for each page of a PDF file. 
+ * + * Run as: go run pdf.go (the input path, reconstr_words.pdf, is hard-coded in main) + */ + package main import ( "fmt" - "log" "os" + "github.com/unidoc/unipdf/v4/creator" "github.com/unidoc/unipdf/v4/extractor" "github.com/unidoc/unipdf/v4/model" ) func main() { - f, err := os.Open("../../test.pdf") + // err := reconstruct("../../test.pdf") + // if err != nil { + // fmt.Printf("Error: %v\n", err) + // os.Exit(1) + // } + + err := outputPdfLines("reconstr_words.pdf") if err != nil { - log.Fatalf("Failed to open PDF: %v\n", err) + fmt.Printf("Error: %v\n", err) + os.Exit(1) } + + +} + +// outputPdfLines prints the points of every stroke path on each page of the PDF file to stdout. +func outputPdfLines(inputPath string) error { + f, err := os.Open(inputPath) + if err != nil { + return err + } + defer f.Close() + pdfReader, err := model.NewPdfReader(f) if err != nil { - log.Fatalf("Failed to read PDF: %v\n", err) + return err } + numPages, err := pdfReader.GetNumPages() if err != nil { - log.Fatalf("Failed to retrieve the number of pages: %v\n", err) + return err } - fmt.Printf("Total number of pages: %d\n", numPages) + + // Iterate through pages. fmt.Printf("--------------------\n") - fmt.Printf("PDF to text extraction:\n") + fmt.Printf("PDF lines extraction:\n") fmt.Printf("--------------------\n") for i := 0; i < numPages; i++ { pageNum := i + 1 page, err := pdfReader.GetPage(pageNum) if err != nil { - panic(err) + return err } ex, err := extractor.New(page) if err != nil { - panic(err) + return err } - text, err := ex.ExtractText() + fmt.Println("------------------------------") + fmt.Printf("Page %d:\n", pageNum) + + // Extract stroke paths from the current page. + paths, err := ex.ExtractStrokePaths() if err != nil { - panic(err) + return err } - fmt.Println("------------------------------") - fmt.Printf("Page %d:\n", pageNum) - fmt.Printf("\"%s\"\n", text) - fmt.Println("------------------------------") + // Print debugging info. 
+ for i, path := range paths { + fmt.Printf("Path %d:\n", i) + for j, point := range path.Points { + fmt.Printf("Point %d: %f %f \n", j, point.X, point.Y) + } + } + } + + return nil +} + +func reconstruct(pdfPath string) error { + f, err := os.Open(pdfPath) + if err != nil { + return err + } + defer f.Close() + + pdfr, err := model.NewPdfReaderLazy(f) + if err != nil { + return err + } + + c := creator.New() + + for pageNum := 1; pageNum <= len(pdfr.PageList); pageNum++ { + page, err := pdfr.GetPage(pageNum) + if err != nil { + return err + } + + extr, err := extractor.New(page) + if err != nil { + return err + } + pageText, _, _, err := extr.ExtractPageText() + if err != nil { + return err + } + + // Start on a new page. + c.NewPage() + fmt.Printf("Page %d\n", pageNum) + + text := pageText.Text() + textmarks := pageText.Marks() + fmt.Printf("%s\n", text) + + // Reconstruct the text, each single TextMark drawn at a time with creator.Paragraph. + for _, tm := range textmarks.Elements() { + if tm.Font == nil { + continue + } + fmt.Printf("%s\n", tm.Text) + // Reconstruct by drawing each glyph from textmarks with the creator package. + para := c.NewStyledParagraph() + para.SetText(tm.Original) + para.SetFont(tm.Font) + para.SetFontSize(tm.FontSize) + r, g, b, _ := tm.StrokeColor.RGBA() + rf, gf, bf := float64(r)/0xffff, float64(g)/0xffff, float64(b)/0xffff + para.SetFontColor(creator.ColorRGBFromArithmetic(rf, gf, bf)) + // Convert to PDF coordinate system. + yPos := c.Context().PageHeight - (tm.BBox.Lly + tm.BBox.Height()) + para.SetPos(tm.BBox.Llx, yPos) // Upper left corner. 
+ c.Draw(para) + } } + return c.WriteToFile("reconstr_words.pdf") } diff --git a/cmd/store/test.go b/cmd/store/test.go new file mode 100644 index 0000000..d05ae06 --- /dev/null +++ b/cmd/store/test.go @@ -0,0 +1,117 @@ +package main + +import ( + "fmt" + "time" + + "airlines/pkg/localstore" + "airlines/pkg/model" +) + +func main() { + store := localstore.NewLocalStore() + fmt.Println("store created") + + u := &model.User{ + Name: "a", + Surname: "b", + Fathersname: "A", + } + + u1, err := store.SaveUser(u) + + if err != nil { + fmt.Println("error saving user:", err) + return + } + + fmt.Println("user saved:", u1) + + u = &model.User{ + Name: "c", + Surname: "d", + } + + u2, err := store.SaveUser(u) + if err != nil { + fmt.Println("error saving user:", err) + return + } + + fmt.Println("user saved:", u2) + + + u = &model.User{ + Name: "a", + Surname: "b", + Fathersname: "ABBBBB", + Birthday: time.Now(), + } + + u3, err := store.SaveUser(u) + + if err != nil { + fmt.Println("error saving user:", err) + return + } + fmt.Println("user saved:", u3) + + now := time.Now() + now = time.Date(now.Year(), now.Month(), now.Day(), 0, 0, 0, 0, time.UTC) + + f := &model.Flight{ + Number: "AB123", + From: "JFK", + To: "LAX", + Date: now, + } + + f1, err := store.SaveFlight(f) + if err != nil { + fmt.Println("error saving flight:", err) + return + } + fmt.Println("flight saved:", f1) + + f = &model.Flight{ + Number: "CD456", + From: "LAX", + To: "SFO", + Date: time.Now(), + } + f2, err := store.SaveFlight(f) + if err != nil { + fmt.Println("error saving flight:", err) + return + } + fmt.Println("flight saved:", f2) + + + f = &model.Flight{ + Number: "AB123", + From: "JFK", + To: "LAX", + Date: now.Add(10 * time.Second), + HasTime: true, + } + + f3, err := store.SaveFlight(f) + if err != nil { + fmt.Println("error saving flight:", err) + return + } + fmt.Println("flight saved:", f3) + + + + f4, err := store.SaveFlight(f) + if err != nil { + fmt.Println("error saving flight:", err) + 
return + } + fmt.Println("flight saved:", f4) + + + + // fmt.Println(store.ExportAllCSVs("/tmp/ds")) +} diff --git a/cmd/xlsx/main.go b/cmd/xlsx/main.go index 4ccb01e..b960448 100644 --- a/cmd/xlsx/main.go +++ b/cmd/xlsx/main.go @@ -2,46 +2,76 @@ package main import ( "airlines/pkg/adapters/xlsx" + csvwriter "airlines/pkg/csvWriter" "fmt" "os" + "sync" ) -// func readXLSX(path string) { -// tickets, err := xlsx.UnmarshallXlsxFile("/home/alex/ds-data/YourBoardingPassDotAero/YourBoardingPassDotAero-2017-11-30.xlsx") -// if err != nil { -// panic(err) -// } - -// } - func main() { tickets := make([]xlsx.Ticket, 0) baseDir := "/home/alex/ds-data/YourBoardingPassDotAero/" items, _ := os.ReadDir(baseDir) + var mu sync.Mutex + var wg sync.WaitGroup + sem := make(chan struct{}, 8) for _, item := range items { if !item.IsDir() { - fmt.Println("Processing file:", item.Name()) - parsedTickets, err := xlsx.UnmarshallXlsxFile(baseDir + item.Name()) - if err != nil { - panic(err) - } - tickets = append(tickets, parsedTickets...) + wg.Add(1) + sem <- struct{}{} + go func(name string) { + defer func() { <-sem }() + defer wg.Done() + fmt.Println("Processing file:", name) + parsedTickets, err := xlsx.UnmarshallXlsxFile(baseDir + name) + if err != nil { + panic(err) + } + mu.Lock() + defer mu.Unlock() + tickets = append(tickets, parsedTickets...) 
+ + }(item.Name()) } } - // for _, ticket := range tickets { - // u, err := ticket.ToUser() - // if err != nil { - // panic(err) - // } - // f, err := ticket.ToFlight() - // if err != nil { - // panic(err) - // } - // c, err := ticket.ToCard() - // if err != nil { - // panic(err) - // } - // fmt.Printf("%+v %+v %+v\n", u, f, c) - // } + wg.Wait() + + fmt.Println("finished") + + file, err := csvwriter.NewCsvWriter("/tmp/output.csv") + if err != nil { + panic(err) + } + defer file.Close() + + file.Write([]string{"Number", "FromAer", "FromCoordsLat", "FromCoordsLong", "ToAer", "ToCoordsLat", "ToCoordsLong", "DateUnix"}) + + for i, ticket := range tickets { + if step := len(tickets) / 100; step > 0 && i%step == 0 { // step is 0 below 100 tickets: skip progress output instead of dividing by zero + fmt.Printf("%f\n", float32(i)/float32(len(tickets))*100) + } + if step := len(tickets) / 100 * 5; step > 0 && i%step == 0 { // same guard for the periodic sync; the final Sync after the loop still runs + file.Sync() + } + // u, err := ticket.ToUser() + // if err != nil { + // panic(err) + // } + f, err := ticket.ToFlight() + if err != nil { + panic(err) + } + file.Write([]string{f.Number, + f.From, fmt.Sprintf("%v", f.FromCoords.Lat), fmt.Sprintf("%v", f.FromCoords.Long), + f.To, fmt.Sprintf("%v", f.ToCoords.Lat), fmt.Sprintf("%v", f.ToCoords.Long), + fmt.Sprintf("%v", f.Date.Unix()), + }) + // c, err := ticket.ToCard() + // if err != nil { + // panic(err) + // } + // fmt.Printf("%+v %+v %+v\n", u, f, c) + } + file.Sync() } diff --git a/cmd/xml/xml.go b/cmd/xml/xml.go new file mode 100644 index 0000000..2679114 --- /dev/null +++ b/cmd/xml/xml.go @@ -0,0 +1,32 @@ +package main + +import ( + "airlines/pkg/adapters/xml" + "airlines/pkg/localstore" + "fmt" +) + +func main() { + pointzUsers, err := xml.UnmarshalXml("/home/alex/ds-data/PointzAggregator-AirlinesData.xml") + if err != nil { + panic(err) + } + + fmt.Println("unmarshall ok") + + // for _, user := range pointzUsers.Users { + // 
fmt.Printf("User UID: %s, Name: %s %s\n", user.UID, user.Name.First, user.Name.Last) + // for _, card := range user.Cards.Card { + // fmt.Printf(" Card Number: %s, Program: %s\n", card.Number, 
card.Program) + // for _, activity := range card.Activities.Activitys { + // fmt.Printf(" Activity Type: %s, Code: %s, Date: %s, Departure: %s, Arrival: %s, Fare: %s\n", + // activity.Type, activity.Code, activity.Date, activity.Departure, activity.Arrival, activity.Fare) + // } + // } + // } + store := localstore.NewLocalStore() + + pointzUsers.DumpToDb(store) + + store.ExportAllCSVs("/tmp/ds") +} diff --git a/cmd/yaml/main.go b/cmd/yaml/main.go new file mode 100644 index 0000000..232ba79 --- /dev/null +++ b/cmd/yaml/main.go @@ -0,0 +1,18 @@ +package main + +import ( + "airlines/pkg/adapters/yaml" + "airlines/pkg/localstore" +) + +func main() { + yamlData, err := yaml.UnmarshallYaml("/home/alex/ds-data/SkyTeam-Exchange.yaml") + if err != nil { + panic(err) + } + + store := localstore.NewLocalStore() + + yamlData.DumpToDb(store) + store.ExportAllCSVs("/tmp/ds") +} -- cgit v1.2.3