diff options
| -rw-r--r-- | Makefile | 2 | ||||
| -rw-r--r-- | cmd/airlines/main.go | 1 | ||||
| -rw-r--r-- | cmd/fiotest/fio.go | 15 | ||||
| -rw-r--r-- | cmd/xlsx/main.go | 46 | ||||
| -rw-r--r-- | go.mod | 19 | ||||
| -rw-r--r-- | go.sum | 25 | ||||
| -rw-r--r-- | pkg/adapters/json/json.go | 4 | ||||
| -rw-r--r-- | pkg/adapters/json/model.go | 8 | ||||
| -rw-r--r-- | pkg/adapters/xlsx/model.go | 138 | ||||
| -rw-r--r-- | pkg/adapters/xlsx/registry.go | 69 | ||||
| -rw-r--r-- | pkg/adapters/xlsx/xlsx.go | 90 | ||||
| -rw-r--r-- | pkg/names/fio.go | 194 | ||||
| -rw-r--r-- | pkg/names/gender.go | 47 | ||||
| -rw-r--r-- | pkg/store/db.go | 1 |
14 files changed, 647 insertions, 12 deletions
@@ -1,6 +1,4 @@ all: airlines - - airlines: go run -C ./cmd/airlines .
\ No newline at end of file diff --git a/cmd/airlines/main.go b/cmd/airlines/main.go index 092a280..8eeaef5 100644 --- a/cmd/airlines/main.go +++ b/cmd/airlines/main.go @@ -18,7 +18,6 @@ func main() { if err != nil { fmt.Println(err) } - store, err := store.NewStore(fmt.Sprintf("postgres://%s:%s@%s:%s/%s", os.Getenv("DB_USER"), os.Getenv("DB_PASSWORD"), os.Getenv("DB_HOST"), os.Getenv("DB_PORT"), os.Getenv("DB_NAME"))) if err != nil { panic(err) diff --git a/cmd/fiotest/fio.go b/cmd/fiotest/fio.go new file mode 100644 index 0000000..8195da7 --- /dev/null +++ b/cmd/fiotest/fio.go @@ -0,0 +1,15 @@ +package main + +import ( + "airlines/pkg/names" + "fmt" +) + +func main() { + + f, err := names.ParseLatinName("MAKAR A TIKHOMIROV") + if err != nil { + panic(err) + } + fmt.Printf("%+v\n", f) +} diff --git a/cmd/xlsx/main.go b/cmd/xlsx/main.go new file mode 100644 index 0000000..053cf66 --- /dev/null +++ b/cmd/xlsx/main.go @@ -0,0 +1,46 @@ +package main + +import ( + "airlines/pkg/adapters/xlsx" + "fmt" + "io/ioutil" +) + +func readXLSX(path string) { + tickets, err := xlsx.UnmarshallXlsxFile("/home/alex/ds-data/YourBoardingPassDotAero/YourBoardingPassDotAero-2017-11-30.xlsx") + if err != nil { + panic(err) + } + for _, ticket := range tickets { + u, err := ticket.ToUser() + if err != nil { + panic(err) + } + f, err := ticket.ToFlight() + if err != nil { + panic(err) + } + c, err := ticket.ToCard() + if err != nil { + panic(err) + } + fmt.Printf("%+v %+v %+v\n", u, f, c) + } +} + +func main() { + tickets := make([]xlsx.Ticket, 0) + baseDir := "/home/alex/ds-data/YourBoardingPassDotAero/" + items, _ := ioutil.ReadDir(baseDir) + for _, item := range items { + if !item.IsDir() { + fmt.Println("Processing file:", item.Name()) + parsedTickets, err := xlsx.UnmarshallXlsxFile(baseDir + item.Name()) + if err != nil { + panic(err) + } + tickets = append(tickets, parsedTickets...) + } + } + +} @@ -3,16 +3,29 @@ module airlines go 1.25.1 require ( + github.com/joho/godotenv v1.5.1 + gorm.io/driver/sqlite v1.6.0 +) + +require ( github.com/jackc/pgpassfile v1.0.0 // indirect github.com/jackc/pgservicefile v0.0.0-20240606120523-5a60cdf6a761 // indirect github.com/jackc/pgx/v5 v5.6.0 // indirect github.com/jackc/puddle/v2 v2.2.2 // indirect github.com/jinzhu/inflection v1.0.0 // indirect github.com/jinzhu/now v1.1.5 // indirect - github.com/joho/godotenv v1.5.1 - golang.org/x/crypto v0.31.0 // indirect + github.com/leonm1/airports-go v0.0.0-20180324035101-5e1c3ff18691 + github.com/mattn/go-sqlite3 v1.14.22 // indirect + github.com/richardlehane/mscfb v1.0.4 // indirect + github.com/richardlehane/msoleps v1.0.4 // indirect + github.com/tiendc/go-deepcopy v1.7.1 // indirect + github.com/xuri/efp v0.0.1 // indirect + github.com/xuri/excelize/v2 v2.10.0 // indirect + github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 // indirect + golang.org/x/crypto v0.43.0 // indirect + golang.org/x/net v0.46.0 // indirect golang.org/x/sync v0.17.0 // indirect - golang.org/x/text v0.29.0 // indirect + golang.org/x/text v0.30.0 // indirect gorm.io/driver/postgres v1.6.0 // indirect gorm.io/gorm v1.31.0 // indirect ) @@ -17,12 +17,33 @@ github.com/jinzhu/now v1.1.5 h1:/o9tlHleP7gOFmsnYNz3RGnqzefHA47wQpKrrdTIwXQ= github.com/jinzhu/now v1.1.5/go.mod h1:d3SSVoowX0Lcu0IBviAWJpolVfI5UJVZZ7cO71lE/z8= github.com/joho/godotenv v1.5.1 h1:7eLL/+HRGLY0ldzfGMeQkb7vMd0as4CfYvUVzLqw0N0= github.com/joho/godotenv v1.5.1/go.mod h1:f4LDr5Voq0i2e/R5DDNOoa2zzDfwtkZa6DnEwAbqwq4= +github.com/leonm1/airports-go v0.0.0-20180324035101-5e1c3ff18691 h1:p6JA+fwoJonS0U+lIuOjjwujcuJ79zK+45/s0AJ1dUM= +github.com/leonm1/airports-go v0.0.0-20180324035101-5e1c3ff18691/go.mod h1:NOvrhZvg7XCKq9koo59F4oSDTvmJIlJ/EEmtpIJBgMg= +github.com/mattn/go-sqlite3 v1.14.22 h1:2gZY6PC6kBnID23Tichd1K+Z0oS6nE/XwU+Vz/5o4kU= +github.com/mattn/go-sqlite3 v1.14.22/go.mod h1:Uh1q+B4BYcTPb+yiD3kU8Ct7aC0hY9fxUwlHK0RXw+Y= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= +github.com/richardlehane/mscfb v1.0.4 h1:WULscsljNPConisD5hR0+OyZjwK46Pfyr6mPu5ZawpM= +github.com/richardlehane/mscfb v1.0.4/go.mod h1:YzVpcZg9czvAuhk9T+a3avCpcFPMUWm7gK3DypaEsUk= +github.com/richardlehane/msoleps v1.0.1/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg= +github.com/richardlehane/msoleps v1.0.4 h1:WuESlvhX3gH2IHcd8UqyCuFY5yiq/GR/yqaSM/9/g00= +github.com/richardlehane/msoleps v1.0.4/go.mod h1:BWev5JBpU9Ko2WAgmZEuiz4/u3ZYTKbjLycmwiWUfWg= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= +github.com/tiendc/go-deepcopy v1.7.1 h1:LnubftI6nYaaMOcaz0LphzwraqN8jiWTwm416sitff4= +github.com/tiendc/go-deepcopy v1.7.1/go.mod h1:4bKjNC2r7boYOkD2IOuZpYjmlDdzjbpTRyCx+goBCJQ= +github.com/xuri/efp v0.0.1 h1:fws5Rv3myXyYni8uwj2qKjVaRP30PdjeYe2Y6FDsCL8= +github.com/xuri/efp v0.0.1/go.mod h1:ybY/Jr0T0GTCnYjKqmdwxyxn2BQf2RcQIIvex5QldPI= +github.com/xuri/excelize/v2 v2.10.0 h1:8aKsP7JD39iKLc6dH5Tw3dgV3sPRh8uRVXu/fMstfW4= +github.com/xuri/excelize/v2 v2.10.0/go.mod h1:SC5TzhQkaOsTWpANfm+7bJCldzcnU/jrhqkTi/iBHBU= +github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9 h1:+C0TIdyyYmzadGaL/HBLbf3WdLgC29pgyhTjAT/0nuE= +github.com/xuri/nfp v0.0.2-0.20250530014748-2ddeb826f9a9/go.mod h1:WwHg+CVyzlv/TX9xqBFXEZAuxOPxn2k1GNHwG41IIUQ= golang.org/x/crypto v0.31.0 h1:ihbySMvVjLAeSH1IbfcRTkD/iNscyz8rGzjF/E5hV6U= golang.org/x/crypto v0.31.0/go.mod h1:kDsLvtWBEx7MV9tJOj9bnXsPbxwJQ6csT/x4KIN4Ssk= +golang.org/x/crypto v0.43.0 h1:dduJYIi3A3KOfdGOHX8AVZ/jGiyPa3IbBozJ5kNuE04= +golang.org/x/crypto v0.43.0/go.mod h1:BFbav4mRNlXJL4wNeejLpWxB7wMbc79PdRGhWKncxR0= +golang.org/x/net v0.46.0 h1:giFlY12I07fugqwPuWJi68oOnpfqFnJIJzaIIm2JVV4= +golang.org/x/net v0.46.0/go.mod h1:Q9BGdFy1y4nkUwiLvT5qtyhAnEHgnQ/zd8PfU6nc210= golang.org/x/sync v0.10.0 h1:3NQrjDixjgGwUOCaF8w2+VYHv0Ve/vGYSbdkTa98gmQ= golang.org/x/sync v0.10.0/go.mod h1:Czt+wKu1gCyEFDUtn0jG5QVvpJ6rzVqr5aXyt9drQfk= golang.org/x/sync v0.17.0 h1:l60nONMj9l5drqw6jlhIELNv9I0A4OFgRsG9k2oT9Ug= @@ -31,12 +52,16 @@ golang.org/x/text v0.21.0 h1:zyQAAkrwaneQ066sspRyJaG9VNi/YJ1NfzcGB3hZ/qo= golang.org/x/text v0.21.0/go.mod h1:4IBbMaMmOPCJ8SecivzSH54+73PCFmPWxNTLm+vZkEQ= golang.org/x/text v0.29.0 h1:1neNs90w9YzJ9BocxfsQNHKuAT4pkghyXc4nhZ6sJvk= golang.org/x/text v0.29.0/go.mod h1:7MhJOA9CD2qZyOKYazxdYMF85OwPdEr9jTtBpO7ydH4= +golang.org/x/text v0.30.0 h1:yznKA/E9zq54KzlzBEAWn1NXSQ8DIp/NYMy88xJjl4k= +golang.org/x/text v0.30.0/go.mod h1:yDdHFIX9t+tORqspjENWgzaCVXgk0yYnYuSZ8UzzBVM= gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= gorm.io/driver/mysql v1.6.0 h1:eNbLmNTpPpTOVZi8MMxCi2aaIm0ZpInbORNXDwyLGvg= gorm.io/driver/mysql v1.6.0/go.mod h1:D/oCC2GWK3M/dqoLxnOlaNKmXz8WNTfcS9y5ovaSqKo= gorm.io/driver/postgres v1.6.0 h1:2dxzU8xJ+ivvqTRph34QX+WrRaJlmfyPqXmoGVjMBa4= gorm.io/driver/postgres v1.6.0/go.mod h1:vUw0mrGgrTK+uPHEhAdV4sfFELrByKVGnaVRkXDhtWo= +gorm.io/driver/sqlite v1.6.0 h1:WHRRrIiulaPiPFmDcod6prc4l2VGVWHz80KspNsxSfQ= +gorm.io/driver/sqlite v1.6.0/go.mod h1:AO9V1qIQddBESngQUKWL9yoH93HIeA1X6V633rBwyT8= gorm.io/gorm v1.25.10 h1:dQpO+33KalOA+aFYGlK+EfxcI5MbO7EP2yYygwh9h+s= gorm.io/gorm v1.25.10/go.mod h1:hbnx/Oo0ChWMn1BIhpy1oYozzpM15i4YPuHDmfYtwg8= gorm.io/gorm v1.31.0 h1:0VlycGreVhK7RF/Bwt51Fk8v0xLiiiFdbGDPIZQ7mJY= diff --git a/pkg/adapters/json/json.go b/pkg/adapters/json/json.go index c0ea4e4..47a563e 100644 --- a/pkg/adapters/json/json.go +++ b/pkg/adapters/json/json.go @@ -100,7 +100,7 @@ type JsonCard struct { func (r *JsonRoot) DumpToDb(ctx context.Context, s *store.Store) { var err error for _, user := range r.ForumProfiles { - dbUser := user.ToUser() + dbUser, _ := user.ToUser() dbUser, err = s.CreateOrGetUser(ctx, dbUser) if err != nil { panic(err) @@ -120,7 +120,7 @@ func (r *JsonRoot) DumpToDb(ctx context.Context, s *store.Store) { } for _, flight := range user.RegisteredFlights { - dbFlight := flight.ToFlight() + dbFlight, _ := flight.ToFlight() _, err = s.AddFlightToUser(ctx, dbUser.ID, dbFlight) if err != nil { fmt.Println(err) diff --git a/pkg/adapters/json/model.go b/pkg/adapters/json/model.go index 2a91f21..2cc5d8e 100644 --- a/pkg/adapters/json/model.go +++ b/pkg/adapters/json/model.go @@ -32,7 +32,7 @@ func onlyDigits(s string) string { return string(out) } -func (jp JsonProfile) ToUser() *model.User { +func (jp JsonProfile) ToUser() (*model.User, error) { return &model.User{ Name: sOrEmpty(jp.RealName.FirstName), Surname: sOrEmpty(jp.RealName.LastName), @@ -40,10 +40,10 @@ func (jp JsonProfile) ToUser() *model.User { Fathersname: "", Sex: jp.Sex, Birthday: model.SentinelBirthday(), - } + }, nil } -func (jf JsonFlight) ToFlight() *model.Flight { +func (jf JsonFlight) ToFlight() (*model.Flight, error) { return &model.Flight{ Number: jf.Flight, From: jf.Departure.Airport, @@ -53,7 +53,7 @@ func (jf JsonFlight) ToFlight() *model.Flight { ToCity: jf.Arrival.City, ToCountry: jf.Arrival.Country, Date: jf.Date.ToDateUTC(), - } + }, nil } func (jc JsonCard) ToCard() (*model.Card, error) { diff --git a/pkg/adapters/xlsx/model.go b/pkg/adapters/xlsx/model.go new file mode 100644 index 0000000..d8c5194 --- /dev/null +++ b/pkg/adapters/xlsx/model.go @@ -0,0 +1,138 @@ +package xlsx + +import ( + "errors" + "regexp" + "strconv" + "strings" + "time" + + "github.com/leonm1/airports-go" +) + +type Ticket struct { + Sheet string + Passenger string + Title string + FlightNumber string + FromCity string + ToCity string + FromAirport string + ToAirport string + FlightDate string // (raw, expected YYYY-MM-DD; Excel text may start with ') + FlightTime string // (raw, expected HH-MM or HH:MM; Excel text may start with ') + PNR string + Card string + TicketNumber string // (may have a leading ' in Excel) +} + +func (t Ticket) DateTime() (time.Time, *time.Location, error) { + loc := t.inferLocationFromAirports() + date := strings.TrimLeft(strings.TrimSpace(t.FlightDate), "'") + hm := strings.TrimLeft(strings.TrimSpace(t.FlightTime), "'") + hm = strings.ReplaceAll(hm, "-", ":") + + if date == "" || hm == "" { + return time.Time{}, loc, errors.New("missing FlightDate or FlightTime") + } + ts, err := time.ParseInLocation("2006-01-02 15:04", date+" "+hm, loc) + return ts, loc, err +} + +func (t Ticket) inferLocationFromAirports() *time.Location { + if loc := iataToLocation(t.FromAirport); loc != nil { + return loc + } + if loc := iataToLocation(t.ToAirport); loc != nil { + return loc + } + return time.Local +} + +func iataToLocation(code string) *time.Location { + iata := strings.ToUpper(strings.TrimSpace(code)) + if len(iata) != 3 { + return nil + } + ap, err := airports.LookupIATA(iata) + if err != nil { + return nil + } + // Prefer IANA tz name + if tz := strings.TrimSpace(ap.Tz); tz != "" && tz != `\N` { + if loc, err := time.LoadLocation(tz); err == nil { + return loc + } + } + // Fallback: fixed offset (no DST) + if ap.Timezone != 0 { + sec := int(ap.Timezone * 3600.0) + return time.FixedZone("UTC"+offsetLabel(sec), sec) + } + return nil +} + +func offsetLabel(sec int) string { + sign := "+" + if sec < 0 { + sign = "-" + sec = -sec + } + h := sec / 3600 + m := (sec % 3600) / 60 + return sign + two(h) + ":" + two(m) +} +func two(x int) string { + if x < 10 { + return "0" + strconv.Itoa(x) + } + return strconv.Itoa(x) +} + +func parseCardLine(s string) (prefix string, number uint64, bonus string) { + raw := strings.TrimSpace(s) + if raw == "" { + return "", 0, "" + } + // number = last run of digits + if m := regexp.MustCompile(`(\d{3,})\D*$`).FindStringSubmatch(raw); len(m) == 2 { + if n, err := strconv.ParseUint(m[1], 10, 64); err == nil { + number = n + } + } + + // tokens (letters with '-', '/', apostrophes) + tokRe := regexp.MustCompile(`[A-Za-z][A-Za-z'/-]*`) + toks := tokRe.FindAllString(s, -1) + + // prefix = first 2–3 letter all-caps-ish token + for _, t := range toks { + u := strings.ToUpper(t) + if len(u) >= 2 && len(u) <= 3 && regexp.MustCompile(`^[A-Z]{2,3}$`).MatchString(u) { + prefix = u + break + } + } + // bonus = all tokens except prefix + words := []string{} + for _, t := range toks { + if strings.ToUpper(t) == prefix { + continue + } + words = append(words, t) + } + if len(words) > 0 { + bonus = strings.Join(words, " ") + } + if bonus == "" && prefix != "" { + bonus = prefix + } + return +} + +func firstNonEmpty(a, b string) string { + if strings.TrimSpace(a) != "" { + return a + } + return b +} diff --git a/pkg/adapters/xlsx/registry.go b/pkg/adapters/xlsx/registry.go new file mode 100644 index 0000000..46c395e --- /dev/null +++ b/pkg/adapters/xlsx/registry.go @@ -0,0 +1,69 @@ +package xlsx + +import ( + "fmt" + "strings" + + "airlines/pkg/model" + "airlines/pkg/names" + + "github.com/leonm1/airports-go" +) + +func (t Ticket) ToUser() (model.User, error) { + fio, err := names.ParseLatinName(t.Passenger) + if err != nil { + return model.User{}, fmt.Errorf("%v %s", t.Sheet, err.Error()) + } + sex := names.GenderFromTitle(t.Title) + + u := model.User{ + Nick: "", + Name: fio.First, + Surname: fio.Last, + Fathersname: fio.Patronymic, + Sex: sex, + } + return u, nil +} + +func (t Ticket) ToCard() (model.Card, error) { + prefix, number, bonus := parseCardLine(t.Card) + if number == 0 && prefix == "" && bonus == "" { + return model.Card{}, nil + } + return model.Card{ + Prefix: prefix, + Number: number, + Bonusprogramm: "", + }, nil +} + +func (t Ticket) ToFlight() (model.Flight, error) { + // Resolve IATA records + fromIATA := strings.ToUpper(strings.TrimSpace(t.FromAirport)) + toIATA := strings.ToUpper(strings.TrimSpace(t.ToAirport)) + + fromRec, _ := airports.LookupIATA(fromIATA) + toRec, _ := airports.LookupIATA(toIATA) + + fromCity := firstNonEmpty(strings.TrimSpace(t.FromCity), fromRec.City) + toCity := firstNonEmpty(strings.TrimSpace(t.ToCity), toRec.City) + + fromCountry := fromRec.Country + toCountry := toRec.Country + departUTC, _, err := t.DateTime() + if err != nil { + return model.Flight{}, err + } + return model.Flight{ + Number: strings.TrimSpace(t.FlightNumber), + From: fromIATA, + FromCity: fromCity, + FromCountry: fromCountry, + To: toIATA, + ToCity: toCity, + ToCountry: toCountry, + Date: departUTC, + }, nil +} diff --git a/pkg/adapters/xlsx/xlsx.go b/pkg/adapters/xlsx/xlsx.go new file mode 100644 index 0000000..6ef9baa --- /dev/null +++ b/pkg/adapters/xlsx/xlsx.go @@ -0,0 +1,90 @@ +package xlsx + +import ( + "fmt" + "strings" + + "github.com/xuri/excelize/v2" +) + +func UnmarshallXlsxFile(fname string) ([]Ticket, error) { + var err error + f, err := excelize.OpenFile(fname) + if err != nil { + return nil, err + } + defer func() { + if err = f.Close(); err != nil { + fmt.Println(err) + } + }() + + get := func(sheet, cell string) (string, error) { + v, err := f.GetCellValue(sheet, cell) + if err != nil { + return "", fmt.Errorf("%s %s: %w", sheet, cell, err) + } + v = strings.Trim(v, " `'\"") + return v, nil + } + + sheetMap := f.GetSheetMap() + tickets := make([]Ticket, 0, len(sheetMap)) + + for _, sheet := range sheetMap { + t := Ticket{} + t.Sheet = sheet + + t.Passenger, err = get(sheet, "B3") + if err != nil { + return nil, err + } + t.Title, err = get(sheet, "A3") + if err != nil { + return nil, err + } + t.FlightNumber, err = get(sheet, "A5") + if err != nil { + return nil, err + } + t.FromCity, err = get(sheet, "D5") + if err != nil { + return nil, err + } + t.ToCity, err = get(sheet, "H5") + if err != nil { + return nil, err + } + t.FromAirport, err = get(sheet, "D7") + if err != nil { + return nil, err + } + t.ToAirport, err = get(sheet, "H7") + if err != nil { + return nil, err + } + t.FlightDate, err = get(sheet, "A9") + if err != nil { + return nil, err + } + t.FlightTime, err = get(sheet, "C9") + if err != nil { + return nil, err + } + t.PNR, err = get(sheet, "B13") + if err != nil { + return nil, err + } + t.Card, err = get(sheet, "F3") + if err != nil { + return nil, err + } + t.TicketNumber, err = get(sheet, "E13") + if err != nil { + return nil, err + } + + tickets = append(tickets, t) + } + return tickets, nil +} diff --git a/pkg/names/fio.go b/pkg/names/fio.go new file mode 100644 index 0000000..4ecca7e --- /dev/null +++ b/pkg/names/fio.go @@ -0,0 +1,194 @@ +package names + +import ( + "errors" + "regexp" + "strings" + "unicode" +) + +type Parts struct { + First string + Last string + Patronymic string // may be "" or an initial like "F" +} + +// ParseLatinName parses 2–3 tokens containing First/Last and optional patronymic (1–2 letters). +// Tokens may be in any order, e.g. "PETROVSKAYA KARINA" or "RUSLAN F EVSEEV". +func ParseLatinName(s string) (Parts, error) { + toks := tokenizeLatin(s) // keeps letters, apostrophes, hyphens, optional trailing dot + if len(toks) < 2 || len(toks) > 3 { + return Parts{}, errors.New("expecting 2 or 3 name parts") + } + + type part struct { + raw string + lo string + } + ps := make([]part, 0, len(toks)) + for _, t := range toks { + lo := strings.ToLower(strings.TrimSuffix(t, ".")) + ps = append(ps, part{raw: t, lo: lo}) + } + + // 1) Patronymic: 1–2 letters (optionally with a trailing dot), or RU-style patronymic suffix + pIdx := -1 + for i, p := range ps { + if isInitial(p.raw) || isPatronymicLatin(p.lo) { + pIdx = i + break + } + } + + // 2) Surname: look for common last-name suffixes among remaining tokens + lIdx := -1 + for i, p := range ps { + if i == pIdx { + continue + } + if looksLikeSurnameLatin(p.lo) { + lIdx = i + break + } + } + + // 3) Assign the rest to first name; tie-break if needed + rem := make([]int, 0, 2) + for i := range ps { + if i != pIdx && i != lIdx { + rem = append(rem, i) + } + } + + // If surname not obvious and we have 2 leftovers, pick the longer one as surname + if lIdx == -1 && len(rem) == 2 { + if runeLen(ps[rem[0]].raw) >= runeLen(ps[rem[1]].raw) { + lIdx = rem[0] + rem = rem[1:] + } else { + lIdx = rem[1] + rem = rem[:1] + } + } + + out := Parts{} + if pIdx != -1 { + out.Patronymic = strings.TrimSuffix(ps[pIdx].raw, ".") + } + if lIdx != -1 { + out.Last = ps[lIdx].raw + } + + // Remaining becomes first name; if still empty (2 tokens), pick the non-surname/non-patronymic as first + if len(rem) == 1 { + out.First = ps[rem[0]].raw + } else if len(ps) == 2 { + for i := range ps { + if i != pIdx && i != lIdx { + out.First = ps[i].raw + } + } + } + + // Normalize to Title Case (capitalize first letter, lowercase rest) + out.First = capWord(out.First) + out.Last = capWord(out.Last) + out.Patronymic = strings.ToUpper(out.Patronymic) // keep initials uppercase + + // Sanity + if out.First == "" || out.Last == "" { + return out, errors.New("unable to classify parts") + } + return out, nil +} + +func tokenizeLatin(s string) []string { + // keep letters, apostrophes, hyphens; allow an optional trailing dot for initials + re := regexp.MustCompile(`(?i)[a-z]+(?:['-][a-z]+)*\.?`) + return re.FindAllString(s, -1) +} + +func isInitial(x string) bool { + x = strings.TrimSuffix(x, ".") + r := []rune(x) + return len(r) >= 1 && len(r) <= 2 && allASCIIAlpha(r) +} + +func isPatronymicLatin(lo string) bool { + // Latin transliterations of RU patronymics (very rough) + sufs := []string{"ovich", "evich", "ich", "ovna", "evna", "ichna", "ogly", "kyzy"} + for _, s := range sufs { + if strings.HasSuffix(lo, s) && len(lo) >= len(s)+2 { + return true + } + } + return false +} + +func looksLikeSurnameLatin(lo string) bool { + // Common Slavic surname endings (male & female forms) + sufs := []string{ + "ov", "ev", "in", "ina", "ova", "eva", + "sky", "skiy", "skii", "skaya", "ska", + "enko", "ienko", + "uk", "yk", "chuk", "czuk", + "yan", "ian", + "dze", "dze", "shvili", + } + for _, s := range sufs { + if strings.HasSuffix(lo, s) { + return true + } + } + // If token contains an apostrophe mid-word (e.g., emel'yanova), still may be a surname + if strings.Contains(lo, "'") { + // feminine -'yanova/-'eva etc. + if strings.HasSuffix(lo, "yanova") || strings.HasSuffix(lo, "yanov") || strings.HasSuffix(lo, "eva") || strings.HasSuffix(lo, "ova") { + return true + } + } + return false +} + +func capWord(s string) string { + if s == "" { + return s + } + // keep internal hyphens/apostrophes, title-case each segment + sep := func(r rune) bool { return r == '-' || r == '\'' } + parts := strings.FieldsFunc(strings.ToLower(s), sep) + i := 0 + builder := strings.Builder{} + for _, r := range s { + if r == '-' || r == '\'' { + builder.WriteRune(r) + continue + } + // find which sub-part this rune belongs to by counting letters consumed + if len(parts) == 0 { + builder.WriteRune(unicode.ToUpper(r)) + continue + } + if i == 0 { + builder.WriteRune(unicode.ToUpper(r)) + } else { + builder.WriteRune(unicode.ToLower(r)) + } + i++ + // crude reset at separators handled above + } + // Simpler/robust alternative: + // return strings.Title(strings.ToLower(s)) // deprecated but OK for ASCII; avoided here. + return strings.ToUpper(string([]rune(s)[0])) + strings.ToLower(s[1:]) +} + +func allASCIIAlpha(r []rune) bool { + for _, ch := range r { + if ch < 'A' || (ch > 'Z' && ch < 'a') || ch > 'z' { + return false + } + } + return true +} + +func runeLen(s string) int { return len([]rune(s)) } diff --git a/pkg/names/gender.go b/pkg/names/gender.go new file mode 100644 index 0000000..bdd16e1 --- /dev/null +++ b/pkg/names/gender.go @@ -0,0 +1,47 @@ +package names + +import ( + "airlines/pkg/model" + "strings" + "unicode" +) + +func normalizeTitle(x string) string { + x = strings.ToLower(x) + // strip common punctuation + x = strings.ReplaceAll(x, ".", "") + x = strings.ReplaceAll(x, "'", "") + x = strings.ReplaceAll(x, "’", "") + x = strings.ReplaceAll(x, "-", "") + return x +} + +func GenderFromTitle(s string) model.Sex { + if s == "" { + return model.SexUnknown + } + // only first token (before space/comma/slash/etc.) + cut := func(r rune) bool { return unicode.IsSpace(r) || r == ',' || r == '/' || r == '&' } + first := strings.FieldsFunc(s, cut) + if len(first) == 0 { + return model.SexUnknown + } + t := normalizeTitle(first[0]) + + // male honorifics + switch t { + case "mr", "sir", "lord", "monsieur", "m", "don", "senor", "sr": // "sr" may collide with "senior"; context needed + return model.SexMale + } + + // female honorifics + switch t { + case "mrs", "miss", "ms", "madam", "madame", "mademoiselle", "mlle", + "lady", "dame", "senora", "sra", "señora", "srta", "srita", "dona": + return model.SexFemale + } + + // neutral/ambiguous titles (return Unknown) + // e.g., "mx", "dr", "prof", "rev", "coach", "officer", etc. + return model.SexUnknown +} diff --git a/pkg/store/db.go b/pkg/store/db.go index 802b4ec..648bca8 100644 --- a/pkg/store/db.go +++ b/pkg/store/db.go @@ -10,6 +10,7 @@ import ( "airlines/pkg/model" "gorm.io/driver/postgres" + _ "gorm.io/driver/sqlite" "gorm.io/gorm" "gorm.io/gorm/clause" "gorm.io/gorm/logger" |
