aboutsummaryrefslogtreecommitdiff
path: root/pkg/names
diff options
context:
space:
mode:
Diffstat (limited to 'pkg/names')
-rw-r--r-- pkg/names/fio.go 32
-rw-r--r-- pkg/names/gender.go 12
2 files changed, 16 insertions, 28 deletions
diff --git a/pkg/names/fio.go b/pkg/names/fio.go
index 4ecca7e..0b83633 100644
--- a/pkg/names/fio.go
+++ b/pkg/names/fio.go
@@ -13,8 +13,6 @@ type Parts struct {
Patronymic string // may be "" or an initial like "F"
}
-// ParseLatinName parses 2–3 tokens containing First/Last and optional patronymic (1–2 letters).
-// Tokens may be in any order, e.g. "PETROVSKAYA KARINA" or "RUSLAN F EVSEEV".
func ParseLatinName(s string) (Parts, error) {
toks := tokenizeLatin(s) // keeps letters, apostrophes, hyphens, optional trailing dot
if len(toks) < 2 || len(toks) > 3 {
@@ -31,16 +29,16 @@ func ParseLatinName(s string) (Parts, error) {
ps = append(ps, part{raw: t, lo: lo})
}
- // 1) Patronymic: 1–2 letters (optionally with a trailing dot), or RU-style patronymic suffix
+ // patronymic (father's name): a 1–2 letter initial or a token with an RU-style patronymic suffix
pIdx := -1
for i, p := range ps {
- if isInitial(p.raw) || isPatronymicLatin(p.lo) {
+ if isInitial(p.raw) || isFathersnameLatin(p.lo) {
pIdx = i
break
}
}
- // 2) Surname: look for common last-name suffixes among remaining tokens
+ // Surname
lIdx := -1
for i, p := range ps {
if i == pIdx {
@@ -52,7 +50,7 @@ func ParseLatinName(s string) (Parts, error) {
}
}
- // 3) Assign the rest to first name; tie-break if needed
+ // first name
rem := make([]int, 0, 2)
for i := range ps {
if i != pIdx && i != lIdx {
@@ -60,7 +58,7 @@ func ParseLatinName(s string) (Parts, error) {
}
}
- // If surname not obvious and we have 2 leftovers, pick the longer one as surname
+ // if surname not obvious and we have 2 leftovers, pick the longer one as surname ;)
if lIdx == -1 && len(rem) == 2 {
if runeLen(ps[rem[0]].raw) >= runeLen(ps[rem[1]].raw) {
lIdx = rem[0]
@@ -79,7 +77,7 @@ func ParseLatinName(s string) (Parts, error) {
out.Last = ps[lIdx].raw
}
- // Remaining becomes first name; if still empty (2 tokens), pick the non-surname/non-patronymic as first
+ // remaining becomes first name
if len(rem) == 1 {
out.First = ps[rem[0]].raw
} else if len(ps) == 2 {
@@ -90,12 +88,12 @@ func ParseLatinName(s string) (Parts, error) {
}
}
- // Normalize to Title Case (capitalize first letter, lowercase rest)
+ // normalize to Title Case
out.First = capWord(out.First)
out.Last = capWord(out.Last)
out.Patronymic = strings.ToUpper(out.Patronymic) // keep initials uppercase
- // Sanity
+ // fail if either the first or the last name could not be determined
if out.First == "" || out.Last == "" {
return out, errors.New("unable to classify parts")
}
@@ -103,7 +101,6 @@ func ParseLatinName(s string) (Parts, error) {
}
func tokenizeLatin(s string) []string {
- // keep letters, apostrophes, hyphens; allow an optional trailing dot for initials
re := regexp.MustCompile(`(?i)[a-z]+(?:['-][a-z]+)*\.?`)
return re.FindAllString(s, -1)
}
@@ -114,8 +111,7 @@ func isInitial(x string) bool {
return len(r) >= 1 && len(r) <= 2 && allASCIIAlpha(r)
}
-func isPatronymicLatin(lo string) bool {
- // Latin transliterations of RU patronymics (very rough)
+func isFathersnameLatin(lo string) bool {
sufs := []string{"ovich", "evich", "ich", "ovna", "evna", "ichna", "ogly", "kyzy"}
for _, s := range sufs {
if strings.HasSuffix(lo, s) && len(lo) >= len(s)+2 {
@@ -126,7 +122,6 @@ func isPatronymicLatin(lo string) bool {
}
func looksLikeSurnameLatin(lo string) bool {
- // Common Slavic surname endings (male & female forms)
sufs := []string{
"ov", "ev", "in", "ina", "ova", "eva",
"sky", "skiy", "skii", "skaya", "ska",
@@ -140,9 +135,7 @@ func looksLikeSurnameLatin(lo string) bool {
return true
}
}
- // If token contains an apostrophe mid-word (e.g., emel'yanova), still may be a surname
if strings.Contains(lo, "'") {
- // feminine -'yanova/-'eva etc.
if strings.HasSuffix(lo, "yanova") || strings.HasSuffix(lo, "yanov") || strings.HasSuffix(lo, "eva") || strings.HasSuffix(lo, "ova") {
return true
}
@@ -154,7 +147,6 @@ func capWord(s string) string {
if s == "" {
return s
}
- // keep internal hyphens/apostrophes, title-case each segment
sep := func(r rune) bool { return r == '-' || r == '\'' }
parts := strings.FieldsFunc(strings.ToLower(s), sep)
i := 0
@@ -164,7 +156,7 @@ func capWord(s string) string {
builder.WriteRune(r)
continue
}
- // find which sub-part this rune belongs to by counting letters consumed
+
if len(parts) == 0 {
builder.WriteRune(unicode.ToUpper(r))
continue
@@ -175,10 +167,8 @@ func capWord(s string) string {
builder.WriteRune(unicode.ToLower(r))
}
i++
- // crude reset at separators handled above
}
- // Simpler/robust alternative:
- // return strings.Title(strings.ToLower(s)) // deprecated but OK for ASCII; avoided here.
+
return strings.ToUpper(string([]rune(s)[0])) + strings.ToLower(s[1:])
}
diff --git a/pkg/names/gender.go b/pkg/names/gender.go
index bdd16e1..4282d9d 100644
--- a/pkg/names/gender.go
+++ b/pkg/names/gender.go
@@ -8,7 +8,7 @@ import (
func normalizeTitle(x string) string {
x = strings.ToLower(x)
- // strip common punctuation
+ // strip punctuation
x = strings.ReplaceAll(x, ".", "")
x = strings.ReplaceAll(x, "'", "")
x = strings.ReplaceAll(x, "’", "")
@@ -20,7 +20,6 @@ func GenderFromTitle(s string) model.Sex {
if s == "" {
return model.SexUnknown
}
- // only first token (before space/comma/slash/etc.)
cut := func(r rune) bool { return unicode.IsSpace(r) || r == ',' || r == '/' || r == '&' }
first := strings.FieldsFunc(s, cut)
if len(first) == 0 {
@@ -28,20 +27,19 @@ func GenderFromTitle(s string) model.Sex {
}
t := normalizeTitle(first[0])
- // male honorifics
+ // male
switch t {
- case "mr", "sir", "lord", "monsieur", "m", "don", "senor", "sr": // "sr" may collide with "senior"; context needed
+ case "mr", "sir", "lord", "monsieur", "m", "don", "senor", "sr":
return model.SexMale
}
- // female honorifics
+ // female
switch t {
case "mrs", "miss", "ms", "madam", "madame", "mademoiselle", "mlle",
"lady", "dame", "senora", "sra", "señora", "srta", "srita", "dona":
return model.SexFemale
}
- // neutral/ambiguous titles (return Unknown)
- // e.g., "mx", "dr", "prof", "rev", "coach", "officer", etc.
+ // neutral, ambiguous or unrecognized titles (e.g. "mx", "dr", "prof") yield unknown
return model.SexUnknown
}