diff --git a/api/genre_normalize.go b/api/genre_normalize.go new file mode 100644 index 00000000..885f8cf4 --- /dev/null +++ b/api/genre_normalize.go @@ -0,0 +1,98 @@ +package api + +import ( + "strings" + "unicode" +) + +// canonicalGenres maps a collapsed lookup key (see genreKey) to the canonical +// display form for genres that must not be plain title-cased — either because +// they merge separator variants ("hip-hop"/"hiphop" -> "Hip-Hop/Rap") or +// because their conventional casing is not title case (R&B/Soul, EDM, ...). +// +// The canonical output forms mirror go-openaudio's GenreAllowlist +// (pkg/etl/processors/entity_manager/genre_allowlist.go), which is the +// protocol's source of truth for canonical genre spelling. Keeping these in +// sync means the API's normalized output agrees with the form the upstream ETL +// indexer (the genre write path) treats as canonical. +// +// The key is the lowercased, alphanumeric-only form of the input, so every +// punctuation/spacing variant of a genre maps through the same entry: +// "R&B", "r & b", "rnb", "R&B/Soul" all collapse to "R&B/Soul". +var canonicalGenres = map[string]string{ + // Allowlist genres whose canonical spelling differs from naive title case. + "hiphop": "Hip-Hop/Rap", // "Hip Hop", "hip-hop", "hiphop" + "hiphoprap": "Hip-Hop/Rap", // "Hip-Hop/Rap", "hip hop rap" + "rb": "R&B/Soul", // "r&b", "r & b" + "rnb": "R&B/Soul", // "rnb" + "randb": "R&B/Soul", // "r and b" + "rbsoul": "R&B/Soul", // "R&B/Soul" + "rnbsoul": "R&B/Soul", // "rnb/soul" + "dnb": "Drum & Bass", + "drumandbass": "Drum & Bass", + "drumbass": "Drum & Bass", // "Drum & Bass" itself + "lofi": "Lo-Fi", + + // Acronyms not in the allowlist, kept only to preserve casing (so they are + // not title-cased to "Edm"/"Dj"). + "edm": "EDM", + "dj": "DJ", +} + +// genreKey reduces a genre string to a comparison key: lowercased and stripped +// of every non-alphanumeric rune. 
This is what makes "Hip Hop", "hip-hop", and +// "hiphop" indistinguishable, so variants collapse to a single canonical form. +func genreKey(s string) string { + var b strings.Builder + for _, r := range strings.ToLower(s) { + if unicode.IsLetter(r) || unicode.IsDigit(r) { + b.WriteRune(r) + } + } + return b.String() +} + +// NormalizeGenre collapses genre variants to a single canonical form: +// - trims surrounding whitespace and collapses internal whitespace runs +// - maps known special cases via canonicalGenres (R&B/Soul, EDM, +// Hip-Hop/Rap, ...) +// - otherwise title-cases the value, preserving internal separators +// ("deep house" -> "Deep House") +// +// Already-canonical allowlist values pass through unchanged (e.g. +// "Electronic", "R&B/Soul", "Hip-Hop/Rap"). An empty/whitespace-only input +// returns "". +func NormalizeGenre(s string) string { + trimmed := strings.TrimSpace(s) + if trimmed == "" { + return "" + } + if canon, ok := canonicalGenres[genreKey(trimmed)]; ok { + return canon + } + return titleCaseGenre(trimmed) +} + +// titleCaseGenre upper-cases the first letter of each alphabetic run and +// lower-cases the rest, leaving non-letter separators in place. Internal +// whitespace runs are collapsed to a single space. So "ELECTRONIC" -> +// "Electronic", "deep house" -> "Deep House", "hip-hop/rap" -> "Hip-Hop/Rap". 
+func titleCaseGenre(s string) string { + s = strings.Join(strings.Fields(s), " ") + var b strings.Builder + prevLetter := false + for _, r := range s { + if unicode.IsLetter(r) { + if prevLetter { + b.WriteRune(unicode.ToLower(r)) + } else { + b.WriteRune(unicode.ToUpper(r)) + } + prevLetter = true + } else { + b.WriteRune(r) + prevLetter = false + } + } + return b.String() +} diff --git a/api/genre_normalize_test.go b/api/genre_normalize_test.go new file mode 100644 index 00000000..29fef6e3 --- /dev/null +++ b/api/genre_normalize_test.go @@ -0,0 +1,60 @@ +package api + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGenreNormalize(t *testing.T) { + cases := []struct { + name string + in string + want string + }{ + // trimming + {"trims surrounding whitespace", " Electronic ", "Electronic"}, + {"collapses internal whitespace", "Deep House", "Deep House"}, + {"empty stays empty", "", ""}, + {"whitespace-only stays empty", " ", ""}, + + // casing + {"lowercases to title case", "electronic", "Electronic"}, + {"uppercases to title case", "ELECTRONIC", "Electronic"}, + {"mixed case to title case", "eLeCtRoNiC", "Electronic"}, + {"multi-word title case", "deep house", "Deep House"}, + + // hip-hop / hiphop variants collapse to the allowlist form "Hip-Hop/Rap" + // (not the plain title-cased "Hip Hop") + {"hyphenated hip-hop", "hip-hop", "Hip-Hop/Rap"}, + {"squashed hiphop", "hiphop", "Hip-Hop/Rap"}, + {"spaced hip hop lowercase", "hip hop", "Hip-Hop/Rap"}, + {"uppercase HIP-HOP", "HIP-HOP", "Hip-Hop/Rap"}, + + // r&b variants collapse to the allowlist form "R&B/Soul" + {"r&b lowercase", "r&b", "R&B/Soul"}, + {"rnb", "rnb", "R&B/Soul"}, + {"r & b spaced", "R & B", "R&B/Soul"}, + + // other special cases keep conventional casing + {"edm", "edm", "EDM"}, + {"dj", "DJ", "DJ"}, + {"drum and bass", "drum and bass", "Drum & Bass"}, + {"dnb", "dnb", "Drum & Bass"}, + + // already-canonical allowlist values pass through unchanged + 
{"Electronic unchanged", "Electronic", "Electronic"}, + {"Hip-Hop/Rap unchanged", "Hip-Hop/Rap", "Hip-Hop/Rap"}, + {"R&B/Soul unchanged", "R&B/Soul", "R&B/Soul"}, + {"Drum & Bass unchanged", "Drum & Bass", "Drum & Bass"}, + {"Lo-Fi unchanged", "Lo-Fi", "Lo-Fi"}, + {"Deep House unchanged", "Deep House", "Deep House"}, + {"EDM unchanged", "EDM", "EDM"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, NormalizeGenre(tc.in), "NormalizeGenre(%q)", tc.in) + }) + } +} diff --git a/api/v1_genres_popular.go b/api/v1_genres_popular.go index 8acbd7f0..6045a224 100644 --- a/api/v1_genres_popular.go +++ b/api/v1_genres_popular.go @@ -1,6 +1,7 @@ package api import ( + "sort" "time" "api.audius.co/api/dbv1" @@ -36,19 +37,47 @@ func (app *ApiServer) v1GenresPopular(c *fiber.Ctx) error { return err } + // Genre values are written upstream (by the discovery provider) and are not + // normalized at rest, so the same logical genre can appear under several + // spellings ("Hip Hop", "hip-hop", "hiphop"). Collapse those variants to a + // canonical name here and sum their counts. NOTE: the SQL groups + paginates + // on the raw genre, so this only merges variants that land within the same + // page; the durable fix is to normalize genre on write upstream. + indexByName := make(map[string]int, len(genres)) result := make([]PopularGenre, 0, len(genres)) for _, genre := range genres { - if genre.Count < int64(params.MinCount) { + name := NormalizeGenre(genre.Genre.String) + if name == "" { continue } + if i, ok := indexByName[name]; ok { + result[i].Count += genre.Count + continue + } + + indexByName[name] = len(result) result = append(result, PopularGenre{ - Name: genre.Genre.String, + Name: name, Count: genre.Count, }) } + // Re-sort because merging variants can change the relative ordering, then + // drop anything below min_count (evaluated against the merged total). 
+ sort.SliceStable(result, func(i, j int) bool { + return result[i].Count > result[j].Count + }) + + filtered := result[:0] + for _, g := range result { + if g.Count < int64(params.MinCount) { + continue + } + filtered = append(filtered, g) + } + return c.JSON(fiber.Map{ - "data": result, + "data": filtered, }) }