diff options
Diffstat (limited to 'pkg/names')
| -rw-r--r-- | pkg/names/fio.go | 32 | ||||
| -rw-r--r-- | pkg/names/gender.go | 12 |
2 files changed, 16 insertions, 28 deletions
diff --git a/pkg/names/fio.go b/pkg/names/fio.go index 4ecca7e..0b83633 100644 --- a/pkg/names/fio.go +++ b/pkg/names/fio.go @@ -13,8 +13,6 @@ type Parts struct { Patronymic string // may be "" or an initial like "F" } -// ParseLatinName parses 2–3 tokens containing First/Last and optional patronymic (1–2 letters). -// Tokens may be in any order, e.g. "PETROVSKAYA KARINA" or "RUSLAN F EVSEEV". func ParseLatinName(s string) (Parts, error) { toks := tokenizeLatin(s) // keeps letters, apostrophes, hyphens, optional trailing dot if len(toks) < 2 || len(toks) > 3 { @@ -31,16 +29,16 @@ func ParseLatinName(s string) (Parts, error) { ps = append(ps, part{raw: t, lo: lo}) } - // 1) Patronymic: 1–2 letters (optionally with a trailing dot), or RU-style patronymic suffix + // fathersname pIdx := -1 for i, p := range ps { - if isInitial(p.raw) || isPatronymicLatin(p.lo) { + if isInitial(p.raw) || isFathersnameLatin(p.lo) { pIdx = i break } } - // 2) Surname: look for common last-name suffixes among remaining tokens + // Surname lIdx := -1 for i, p := range ps { if i == pIdx { @@ -52,7 +50,7 @@ func ParseLatinName(s string) (Parts, error) { } } - // 3) Assign the rest to first name; tie-break if needed + // firs name rem := make([]int, 0, 2) for i := range ps { if i != pIdx && i != lIdx { @@ -60,7 +58,7 @@ func ParseLatinName(s string) (Parts, error) { } } - // If surname not obvious and we have 2 leftovers, pick the longer one as surname + // if surname not obvious and we have 2 leftovers, pick the longer one as surname ;) if lIdx == -1 && len(rem) == 2 { if runeLen(ps[rem[0]].raw) >= runeLen(ps[rem[1]].raw) { lIdx = rem[0] @@ -79,7 +77,7 @@ func ParseLatinName(s string) (Parts, error) { out.Last = ps[lIdx].raw } - // Remaining becomes first name; if still empty (2 tokens), pick the non-surname/non-patronymic as first + // remaining becomes first name if len(rem) == 1 { out.First = ps[rem[0]].raw } else if len(ps) == 2 { @@ -90,12 +88,12 @@ func ParseLatinName(s string) (Parts, error) { } } - // Normalize to Title Case (capitalize first letter, lowercase rest) + // normalize to Title Case out.First = capWord(out.First) out.Last = capWord(out.Last) out.Patronymic = strings.ToUpper(out.Patronymic) // keep initials uppercase - // Sanity + // not found ;( if out.First == "" || out.Last == "" { return out, errors.New("unable to classify parts") } @@ -103,7 +101,6 @@ func ParseLatinName(s string) (Parts, error) { } func tokenizeLatin(s string) []string { - // keep letters, apostrophes, hyphens; allow an optional trailing dot for initials re := regexp.MustCompile(`(?i)[a-z]+(?:['-][a-z]+)*\.?`) return re.FindAllString(s, -1) } @@ -114,8 +111,7 @@ func isInitial(x string) bool { return len(r) >= 1 && len(r) <= 2 && allASCIIAlpha(r) } -func isPatronymicLatin(lo string) bool { - // Latin transliterations of RU patronymics (very rough) +func isFathersnameLatin(lo string) bool { sufs := []string{"ovich", "evich", "ich", "ovna", "evna", "ichna", "ogly", "kyzy"} for _, s := range sufs { if strings.HasSuffix(lo, s) && len(lo) >= len(s)+2 { @@ -126,7 +122,6 @@ func isPatronymicLatin(lo string) bool { } func looksLikeSurnameLatin(lo string) bool { - // Common Slavic surname endings (male & female forms) sufs := []string{ "ov", "ev", "in", "ina", "ova", "eva", "sky", "skiy", "skii", "skaya", "ska", @@ -140,9 +135,7 @@ func looksLikeSurnameLatin(lo string) bool { return true } } - // If token contains an apostrophe mid-word (e.g., emel'yanova), still may be a surname if strings.Contains(lo, "'") { - // feminine -'yanova/-'eva etc. if strings.HasSuffix(lo, "yanova") || strings.HasSuffix(lo, "yanov") || strings.HasSuffix(lo, "eva") || strings.HasSuffix(lo, "ova") { return true } @@ -154,7 +147,6 @@ func capWord(s string) string { if s == "" { return s } - // keep internal hyphens/apostrophes, title-case each segment sep := func(r rune) bool { return r == '-' || r == '\'' } parts := strings.FieldsFunc(strings.ToLower(s), sep) i := 0 @@ -164,7 +156,7 @@ func capWord(s string) string { builder.WriteRune(r) continue } - // find which sub-part this rune belongs to by counting letters consumed + if len(parts) == 0 { builder.WriteRune(unicode.ToUpper(r)) continue @@ -175,10 +167,8 @@ func capWord(s string) string { builder.WriteRune(unicode.ToLower(r)) } i++ - // crude reset at separators handled above } - // Simpler/robust alternative: - // return strings.Title(strings.ToLower(s)) // deprecated but OK for ASCII; avoided here. + return strings.ToUpper(string([]rune(s)[0])) + strings.ToLower(s[1:]) } diff --git a/pkg/names/gender.go b/pkg/names/gender.go index bdd16e1..4282d9d 100644 --- a/pkg/names/gender.go +++ b/pkg/names/gender.go @@ -8,7 +8,7 @@ import ( func normalizeTitle(x string) string { x = strings.ToLower(x) - // strip common punctuation + // strip punctuation x = strings.ReplaceAll(x, ".", "") x = strings.ReplaceAll(x, "'", "") x = strings.ReplaceAll(x, "’", "") @@ -20,7 +20,6 @@ func GenderFromTitle(s string) model.Sex { if s == "" { return model.SexUnknown } - // only first token (before space/comma/slash/etc.) cut := func(r rune) bool { return unicode.IsSpace(r) || r == ',' || r == '/' || r == '&' } first := strings.FieldsFunc(s, cut) if len(first) == 0 { @@ -28,20 +27,19 @@ func GenderFromTitle(s string) model.Sex { } t := normalizeTitle(first[0]) - // male honorifics + // male switch t { - case "mr", "sir", "lord", "monsieur", "m", "don", "senor", "sr": // "sr" may collide with "senior"; context needed + case "mr", "sir", "lord", "monsieur", "m", "don", "senor", "sr": return model.SexMale } - // female honorifics + // female switch t { case "mrs", "miss", "ms", "madam", "madame", "mademoiselle", "mlle", "lady", "dame", "senora", "sra", "señora", "srta", "srita", "dona": return model.SexFemale } - // neutral/ambiguous titles (return Unknown) - // e.g., "mx", "dr", "prof", "rev", "coach", "officer", etc. + // flying helicopter return model.SexUnknown } |
