// Package names splits transliterated (Latin-script) personal names into
// first name, surname and optional patronymic using suffix heuristics.
package names

import (
	"errors"
	"regexp"
	"strings"
	"unicode"
)

// Parts is the result of classifying the tokens of a personal name.
type Parts struct {
	First      string
	Last       string
	Patronymic string // "" when absent; otherwise an upper-cased initial like "F" or a title-cased full patronymic
}

// latinTokenRe matches one name token: ASCII letters with optional internal
// apostrophes/hyphens and an optional trailing dot (for initials).
// Compiled once at package scope instead of on every call.
var latinTokenRe = regexp.MustCompile(`(?i)[a-z]+(?:['-][a-z]+)*\.?`)

// patronymicSuffixes are Latin transliterations of patronymic endings (very rough).
var patronymicSuffixes = []string{
	"ovich", "evich", "ich", "ovna", "evna", "ichna", "ogly", "kyzy",
}

// surnameSuffixes are common surname endings (male and female forms).
var surnameSuffixes = []string{
	"ov", "ev", "in", "ina", "ova", "eva",
	"sky", "skiy", "skii", "skaya", "ska",
	"enko", "ienko", "uk", "yk", "chuk", "czuk",
	"yan", "ian", "dze", "shvili",
}

// ParseLatinName parses a name made of 2 or 3 tokens: a first name, a last
// name and, when there are 3 tokens, a patronymic (a 1-2 letter initial or a
// full patronymic such as "Olegovna"). Tokens may appear in any order, e.g.
// "PETROVSKAYA KARINA" or "RUSLAN F EVSEEV".
//
// First and Last are returned title-cased; a patronymic initial is
// upper-cased and a full patronymic is title-cased. Trailing dots are
// dropped from every token.
//
// An error is returned (together with a zero Parts) when the input does not
// contain 2 or 3 tokens, or when a 3-token input has no recognizable
// patronymic so that first name and surname cannot be told apart.
func ParseLatinName(s string) (Parts, error) {
	toks := tokenizeLatin(s)
	if len(toks) < 2 || len(toks) > 3 {
		return Parts{}, errors.New("expecting 2 or 3 name parts")
	}

	// raw keeps the caller's spelling minus a trailing dot; lo is the
	// lower-cased form used for suffix matching.
	raw := make([]string, len(toks))
	lo := make([]string, len(toks))
	for i, t := range toks {
		raw[i] = strings.TrimSuffix(t, ".")
		lo[i] = strings.ToLower(raw[i])
	}

	// 1) Patronymic. With only two tokens both are needed for first/last
	// name, so a short given name ("Li") or an "-ich" surname must not be
	// consumed as a patronymic.
	pIdx := -1
	if len(raw) == 3 {
		pIdx = patronymicIndex(raw, lo)
	}

	// 2) Surname: the remaining token with the strongest surname suffix.
	lIdx := surnameIndex(lo, pIdx)

	// 3) Whatever is left is the first-name candidate set.
	rest := make([]int, 0, len(raw))
	for i := range raw {
		if i != pIdx && i != lIdx {
			rest = append(rest, i)
		}
	}

	// No suffix hint and two leftovers: treat the longer token as the
	// surname (the first one wins a tie).
	if lIdx == -1 && len(rest) == 2 {
		if runeLen(raw[rest[0]]) >= runeLen(raw[rest[1]]) {
			lIdx, rest = rest[0], rest[1:]
		} else {
			lIdx, rest = rest[1], rest[:1]
		}
	}

	if lIdx == -1 || len(rest) != 1 {
		return Parts{}, errors.New("unable to classify parts")
	}

	out := Parts{
		First: capWord(raw[rest[0]]),
		Last:  capWord(raw[lIdx]),
	}
	if pIdx != -1 {
		if isInitial(raw[pIdx]) {
			out.Patronymic = strings.ToUpper(raw[pIdx]) // keep initials uppercase
		} else {
			out.Patronymic = capWord(raw[pIdx])
		}
	}
	return out, nil
}

// patronymicIndex returns the index of the token that is most likely the
// patronymic, or -1. A token with a patronymic suffix wins over initials;
// among several initial-like tokens the last one is chosen, because a
// first-name initial conventionally precedes the patronymic initial
// ("A B Ivanov", "Ivanov A B").
func patronymicIndex(raw, lo []string) int {
	for i, tok := range lo {
		if isPatronymicLatin(tok) {
			return i
		}
	}
	idx := -1
	for i, tok := range raw {
		if isInitial(tok) {
			idx = i
		}
	}
	return idx
}

// surnameIndex returns the index of the token (other than skip) that looks
// most like a surname, or -1 when none carries a known surname suffix.
// The longest matching suffix wins, so "petrovskaya" ("skaya") beats
// "karina" ("ina"); equal suffix lengths are resolved in favour of the
// longer token, then the earlier one.
func surnameIndex(lo []string, skip int) int {
	best, bestLen := -1, 0
	for i, tok := range lo {
		if i == skip {
			continue
		}
		n := surnameSuffixLen(tok)
		if n == 0 {
			continue
		}
		if n > bestLen || (n == bestLen && runeLen(tok) > runeLen(lo[best])) {
			best, bestLen = i, n
		}
	}
	return best
}

// tokenizeLatin extracts name tokens from s: runs of ASCII letters that may
// contain internal apostrophes or hyphens, each optionally followed by a
// single dot. It returns nil when s contains no such token.
func tokenizeLatin(s string) []string {
	return latinTokenRe.FindAllString(s, -1)
}

// isInitial reports whether x, ignoring one trailing dot, consists of
// exactly one or two ASCII letters.
func isInitial(x string) bool {
	r := []rune(strings.TrimSuffix(x, "."))
	if len(r) < 1 || len(r) > 2 {
		return false
	}
	return allASCIIAlpha(r)
}

// isPatronymicLatin reports whether the lower-cased token lo ends with a
// known patronymic suffix preceded by at least two more characters.
func isPatronymicLatin(lo string) bool {
	for _, suf := range patronymicSuffixes {
		if strings.HasSuffix(lo, suf) && len(lo) >= len(suf)+2 {
			return true
		}
	}
	return false
}

// looksLikeSurnameLatin reports whether the lower-cased token lo ends with
// one of the common surname suffixes. Apostrophes inside the token do not
// matter because only the ending is inspected (e.g. "emel'yanova").
func looksLikeSurnameLatin(lo string) bool {
	return surnameSuffixLen(lo) > 0
}

// surnameSuffixLen returns the length of the longest surname suffix that
// the lower-cased token lo ends with, or 0 when none matches.
func surnameSuffixLen(lo string) int {
	longest := 0
	for _, suf := range surnameSuffixes {
		if len(suf) > longest && strings.HasSuffix(lo, suf) {
			longest = len(suf)
		}
	}
	return longest
}

// capWord title-cases a single name token: the first letter of the token
// and of every hyphen-separated part is upper-cased, everything else is
// lower-cased ("RIMSKY-KORSAKOV" -> "Rimsky-Korsakov"). An apostrophe does
// not start a new part, so "EMEL'YANOVA" becomes "Emel'yanova". The empty
// string is returned unchanged.
func capWord(s string) string {
	var b strings.Builder
	b.Grow(len(s))
	atStart := true
	for _, r := range s {
		switch {
		case r == '-':
			b.WriteRune(r)
			atStart = true
		case atStart:
			b.WriteRune(unicode.ToUpper(r))
			atStart = false
		default:
			b.WriteRune(unicode.ToLower(r))
		}
	}
	return b.String()
}

// allASCIIAlpha reports whether every rune in r is an ASCII letter
// (A-Z or a-z). It returns true for an empty slice.
func allASCIIAlpha(r []rune) bool {
	for _, ch := range r {
		isUpper := ch >= 'A' && ch <= 'Z'
		isLower := ch >= 'a' && ch <= 'z'
		if !isUpper && !isLower {
			return false
		}
	}
	return true
}

// runeLen returns the number of runes (not bytes) in s.
func runeLen(s string) int {
	return len([]rune(s))
}