// Source: pkg/names/fio.go (package names), created by commit
// 091963a50c3bb2926f559f01c49e8f5bd03d2bfd ("xlsx", leshe4ka46, 2025-10-19).
// The declarations below rely on the file's imports of "errors", "regexp",
// "strings" and "unicode".

// Parts holds the components of a personal name as classified by
// ParseLatinName.
type Parts struct {
	First      string
	Last       string
	Patronymic string // may be "" or an initial like "F"
}

// latinTokenRe matches one name token: ASCII letters with optional internal
// apostrophes or hyphens, followed by an optional trailing dot (for initials).
// It is compiled once at package scope rather than on every call.
var latinTokenRe = regexp.MustCompile(`(?i)[a-z]+(?:['-][a-z]+)*\.?`)

// patronymicSuffixes are rough Latin transliterations of patronymic endings.
var patronymicSuffixes = []string{
	"ovich", "evich", "ich", "ovna", "evna", "ichna", "ogly", "kyzy",
}

// strongSurnameSuffixes are surname endings (male and female forms) that are
// treated as a reliable surname signal.
var strongSurnameSuffixes = []string{
	"ov", "ev", "ova", "eva",
	"sky", "skiy", "skii", "skaya", "ska",
	"enko", "ienko",
	"uk", "yk", "chuk", "czuk",
	"dze", "shvili",
}

// weakSurnameSuffixes are surname endings that are also frequent in given
// names (Karina, Marina, Martin, Adrian, ...). A token matching only one of
// these loses to a token matching a strong suffix.
var weakSurnameSuffixes = []string{"in", "ina", "yan", "ian"}

// Surname evidence levels returned by surnameStrength, ordered weakest first.
const (
	surnameNone = iota
	surnameWeak
	surnameStrong
)

// ParseLatinName parses 2–3 tokens containing First/Last and an optional
// patronymic. Tokens may be in any order, e.g. "PETROVSKAYA KARINA",
// "KARINA PETROVSKAYA" or "RUSLAN F EVSEEV".
//
// Classification is heuristic:
//  1. The first token that is a 1–2 letter initial (optionally dotted) or ends
//     in a patronymic suffix becomes the patronymic.
//  2. Among the other tokens, the one with the strongest surname suffix
//     becomes the surname; on equal strength the earlier token wins.
//  3. If no token looks like a surname and exactly two tokens remain, the
//     longer one (the earlier one on a tie) becomes the surname.
//  4. A single remaining token becomes the first name.
//
// First and Last are returned with the first letter upper-cased and the rest
// lower-cased; Patronymic is upper-cased with any trailing dot removed.
//
// An error is returned when the input does not contain 2 or 3 tokens (with a
// zero Parts), or when a first name or surname could not be assigned (with the
// partially filled Parts, as before).
func ParseLatinName(s string) (Parts, error) {
	toks := tokenizeLatin(s)
	if len(toks) < 2 || len(toks) > 3 {
		return Parts{}, errors.New("expecting 2 or 3 name parts")
	}

	// lower[i] is toks[i] lower-cased with any trailing dot removed; the
	// suffix heuristics work on this form.
	lower := make([]string, len(toks))
	for i, t := range toks {
		lower[i] = strings.ToLower(strings.TrimSuffix(t, "."))
	}

	// 1) Patronymic: first initial-like or patronymic-suffixed token.
	pIdx := -1
	for i, t := range toks {
		if isInitial(t) || isPatronymicLatin(lower[i]) {
			pIdx = i
			break
		}
	}

	// 2) Surname: strongest suffix evidence wins, so that a given name such as
	// "Karina" (weak "-ina") does not shadow "Petrovskaya" merely by coming
	// first. The strict ">" keeps the earlier token on equal strength.
	lIdx, best := -1, surnameNone
	for i, lo := range lower {
		if i == pIdx {
			continue
		}
		if strength := surnameStrength(lo); strength > best {
			lIdx, best = i, strength
		}
	}

	// 3) Everything not yet classified.
	rem := make([]int, 0, len(toks))
	for i := range toks {
		if i != pIdx && i != lIdx {
			rem = append(rem, i)
		}
	}

	// No obvious surname and two leftovers: take the longer one as surname.
	if lIdx == -1 && len(rem) == 2 {
		if runeLen(toks[rem[0]]) >= runeLen(toks[rem[1]]) {
			lIdx, rem = rem[0], rem[1:]
		} else {
			lIdx, rem = rem[1], rem[:1]
		}
	}

	var out Parts
	if pIdx != -1 {
		// NOTE(review): this upper-cases full patronymics too
		// ("Ivanovich" -> "IVANOVICH"), not only initials — confirm intended.
		out.Patronymic = strings.ToUpper(strings.TrimSuffix(toks[pIdx], "."))
	}
	if lIdx != -1 {
		out.Last = capWord(toks[lIdx])
	}
	// Exactly one leftover is the first name. Zero or several leftovers leave
	// First empty and are reported as a classification failure below.
	if len(rem) == 1 {
		out.First = capWord(toks[rem[0]])
	}

	if out.First == "" || out.Last == "" {
		return out, errors.New("unable to classify parts")
	}
	return out, nil
}

// tokenizeLatin returns every name token found in s, in order of appearance.
// A token keeps its letters, internal apostrophes and hyphens, and an optional
// trailing dot; everything else acts as a separator.
func tokenizeLatin(s string) []string {
	return latinTokenRe.FindAllString(s, -1)
}

// isInitial reports whether x, ignoring one trailing dot, consists of one or
// two ASCII letters.
func isInitial(x string) bool {
	r := []rune(strings.TrimSuffix(x, "."))
	return len(r) >= 1 && len(r) <= 2 && allASCIIAlpha(r)
}

// isPatronymicLatin reports whether the lower-cased token lo ends in one of
// the patronymic suffixes and has at least two more characters before it.
// The match is very rough.
func isPatronymicLatin(lo string) bool {
	for _, suf := range patronymicSuffixes {
		if strings.HasSuffix(lo, suf) && len(lo) >= len(suf)+2 {
			return true
		}
	}
	return false
}

// surnameStrength grades how strongly the lower-cased token lo resembles a
// surname: surnameStrong for a suffix from strongSurnameSuffixes, surnameWeak
// for one from weakSurnameSuffixes only, surnameNone otherwise.
func surnameStrength(lo string) int {
	for _, suf := range strongSurnameSuffixes {
		if strings.HasSuffix(lo, suf) {
			return surnameStrong
		}
	}
	for _, suf := range weakSurnameSuffixes {
		if strings.HasSuffix(lo, suf) {
			return surnameWeak
		}
	}
	return surnameNone
}

// looksLikeSurnameLatin reports whether the lower-cased token lo ends in any
// known surname suffix. Tokens with an internal apostrophe (e.g.
// "emel'yanova") need no special handling: their endings are already covered
// by the suffix lists.
func looksLikeSurnameLatin(lo string) bool {
	return surnameStrength(lo) != surnameNone
}

// capWord upper-cases the first letter of s and lower-cases the rest.
// Hyphens and apostrophes are kept, and the letters after them stay
// lower-case ("PETROVA-IVANOVA" -> "Petrova-ivanova"). The split is done on
// runes so a multi-byte first letter is not cut in half.
func capWord(s string) string {
	if s == "" {
		return s
	}
	r := []rune(s)
	return string(unicode.ToUpper(r[0])) + strings.ToLower(string(r[1:]))
}

// allASCIIAlpha reports whether every rune in r is an ASCII letter (A–Z or
// a–z). It returns true for an empty slice.
func allASCIIAlpha(r []rune) bool {
	for _, ch := range r {
		isUpper := ch >= 'A' && ch <= 'Z'
		isLower := ch >= 'a' && ch <= 'z'
		if !isUpper && !isLower {
			return false
		}
	}
	return true
}

// runeLen returns the number of runes in s.
func runeLen(s string) int { return len([]rune(s)) }