// Package names splits a personal name written in Latin script into first
// name, surname and patronymic using suffix-based heuristics.
package names

import (
	"errors"
	"regexp"
	"strings"
	"unicode"
)

// latinTokenRe matches a single name token: a run of letters, optionally
// continued by apostrophe- or hyphen-joined runs of letters, with an optional
// trailing dot. It is compiled once at package scope rather than on every
// tokenizeLatin call.
var latinTokenRe = regexp.MustCompile(`(?i)[a-z]+(?:['-][a-z]+)*\.?`)

// patronymicSuffixes lists the lower-case endings that mark a token as a
// patronymic in isFathersnameLatin.
var patronymicSuffixes = []string{
	"ovich", "evich", "ich", "ovna", "evna", "ichna", "ogly", "kyzy",
}

// surnameSuffixes lists the lower-case endings that mark a token as a surname
// in looksLikeSurnameLatin.
var surnameSuffixes = []string{
	"ov", "ev", "in", "ina", "ova", "eva",
	"sky", "skiy", "skii", "skaya", "ska",
	"enko", "ienko", "uk", "yk", "chuk", "czuk",
	"yan", "ian", "dze", "shvili",
}

// Parts holds the classified components of a personal name.
type Parts struct {
	First      string
	Last       string
	Patronymic string // may be "" or an initial like "F"
}

// ParseLatinName splits s into first name, surname and patronymic.
//
// s must tokenize (see tokenizeLatin) into exactly two or three parts,
// otherwise an error and a zero Parts are returned. The parts may appear in
// any order; they are classified as follows:
//
//  1. The first token that is an initial (isInitial) or ends in a patronymic
//     suffix (isFathersnameLatin) becomes the patronymic.
//  2. The first remaining token that ends in a surname suffix
//     (looksLikeSurnameLatin) becomes the surname.
//  3. If no surname was recognized and exactly two tokens are left, the
//     longer one (the earlier one on a tie) becomes the surname.
//  4. A single leftover token becomes the first name.
//
// First and Last are returned with the first letter upper-cased and the rest
// lower-cased. A patronymic initial is upper-cased with its trailing dot
// removed; a full patronymic is capitalized like the other parts.
//
// If either the first name or the surname cannot be determined, an error is
// returned together with whatever parts were classified.
func ParseLatinName(s string) (Parts, error) {
	toks := tokenizeLatin(s)
	if len(toks) < 2 || len(toks) > 3 {
		return Parts{}, errors.New("expecting 2 or 3 name parts")
	}

	// lower[i] is toks[i] lower-cased without its trailing dot; the suffix
	// heuristics work on this form.
	lower := make([]string, len(toks))
	for i, t := range toks {
		lower[i] = strings.ToLower(strings.TrimSuffix(t, "."))
	}

	// Patronymic: first initial or token with a patronymic suffix.
	pIdx := -1
	for i, t := range toks {
		if isInitial(t) || isFathersnameLatin(lower[i]) {
			pIdx = i
			break
		}
	}

	// Surname: first non-patronymic token with a surname suffix.
	lIdx := -1
	for i := range toks {
		if i != pIdx && looksLikeSurnameLatin(lower[i]) {
			lIdx = i
			break
		}
	}

	// Everything not yet classified is a first-name candidate.
	rem := make([]int, 0, len(toks))
	for i := range toks {
		if i != pIdx && i != lIdx {
			rem = append(rem, i)
		}
	}

	// If the surname is not obvious and two tokens are left over, pick the
	// longer one as the surname.
	if lIdx == -1 && len(rem) == 2 {
		if runeLen(toks[rem[0]]) >= runeLen(toks[rem[1]]) {
			lIdx, rem = rem[0], rem[1:]
		} else {
			lIdx, rem = rem[1], rem[:1]
		}
	}

	var out Parts
	if pIdx != -1 {
		pat := strings.TrimSuffix(toks[pIdx], ".")
		if isInitial(pat) {
			// Initials stay fully upper-case.
			out.Patronymic = strings.ToUpper(pat)
		} else {
			// A full patronymic is capitalized like the other name parts
			// instead of being upper-cased wholesale.
			out.Patronymic = capWord(pat)
		}
	}
	if lIdx != -1 {
		out.Last = capWord(toks[lIdx])
	}
	// Only an unambiguous single leftover is accepted as the first name.
	if len(rem) == 1 {
		out.First = capWord(toks[rem[0]])
	}

	if out.First == "" || out.Last == "" {
		return out, errors.New("unable to classify parts")
	}
	return out, nil
}

// tokenizeLatin returns the name tokens found in s, in order. A token keeps
// its letters, inner apostrophes and hyphens, and an optional trailing dot;
// everything else acts as a separator.
func tokenizeLatin(s string) []string {
	return latinTokenRe.FindAllString(s, -1)
}

// isInitial reports whether x, ignoring one trailing dot, consists of one or
// two ASCII letters.
func isInitial(x string) bool {
	r := []rune(strings.TrimSuffix(x, "."))
	return len(r) >= 1 && len(r) <= 2 && allASCIIAlpha(r)
}

// isFathersnameLatin reports whether the lower-case token lo ends in one of
// patronymicSuffixes and has at least two more bytes in front of that suffix.
func isFathersnameLatin(lo string) bool {
	for _, suf := range patronymicSuffixes {
		if strings.HasSuffix(lo, suf) && len(lo) >= len(suf)+2 {
			return true
		}
	}
	return false
}

// looksLikeSurnameLatin reports whether the lower-case token lo ends in one
// of surnameSuffixes. Tokens containing an apostrophe need no special case:
// their endings are matched by the same suffix table.
func looksLikeSurnameLatin(lo string) bool {
	for _, suf := range surnameSuffixes {
		if strings.HasSuffix(lo, suf) {
			return true
		}
	}
	return false
}

// capWord returns s with its first rune upper-cased and all following runes
// lower-cased. Letters after a hyphen or apostrophe are not re-capitalized.
// The split is done on runes so a multi-byte first character is not cut in
// half.
func capWord(s string) string {
	if s == "" {
		return s
	}
	rs := []rune(s)
	return string(unicode.ToUpper(rs[0])) + strings.ToLower(string(rs[1:]))
}

// allASCIIAlpha reports whether every rune in r is an ASCII letter (A-Z or
// a-z). It returns true for an empty slice.
func allASCIIAlpha(r []rune) bool {
	for _, ch := range r {
		isUpper := ch >= 'A' && ch <= 'Z'
		isLower := ch >= 'a' && ch <= 'z'
		if !isUpper && !isLower {
			return false
		}
	}
	return true
}

// runeLen returns the number of runes in s.
func runeLen(s string) int {
	return len([]rune(s))
}