From 670ab94a4c52e01719a93eeed3f6d94474abcf63 Mon Sep 17 00:00:00 2001 From: Dylan Jeffers Date: Wed, 17 Jun 2026 17:40:13 -0700 Subject: [PATCH 1/3] feat: normalize genre variants to a canonical form Collapse genre spelling variants (e.g. "Hip Hop"/"hip-hop"/"hiphop", "r&b"/"rnb") to a single canonical name across the read paths. - Add NormalizeGenre() (api/genre_normalize.go): trims + collapses whitespace, title-cases, and maps known special cases (R&B, EDM, DJ, Hip Hop, Drum & Bass, ...) that should not be plain title-cased. - /v1/genres/popular: normalize names and merge + sum counts for variants that collapse to the same canonical name, then re-sort and apply min_count to the merged totals. - Normalize the `genre` filter param on the trending, underground, latest, and users/genre/top endpoints so case/whitespace variants match the canonical stored value. - Add TestGenreNormalize covering trim, casing, hip-hop/hiphop -> Hip Hop, r&b -> R&B, and already-canonical pass-through. The Go service does not write the tracks.genre column (it is populated upstream by the discovery provider), so normalization is applied on read. Fully collapsing variants at rest requires normalizing on write upstream. Co-Authored-By: Claude Opus 4.8 (1M context) --- api/genre_normalize.go | 85 +++++++++++++++++++++++++++ api/genre_normalize_test.go | 57 ++++++++++++++++++ api/v1_genres_popular.go | 35 ++++++++++- api/v1_tracks_latest.go | 2 +- api/v1_tracks_trending.go | 2 +- api/v1_tracks_trending_underground.go | 2 +- api/v1_users_genre_top.go | 7 ++- 7 files changed, 183 insertions(+), 7 deletions(-) create mode 100644 api/genre_normalize.go create mode 100644 api/genre_normalize_test.go diff --git a/api/genre_normalize.go b/api/genre_normalize.go new file mode 100644 index 00000000..4d47ec4a --- /dev/null +++ b/api/genre_normalize.go @@ -0,0 +1,85 @@ +package api + +import ( + "strings" + "unicode" +) + +// canonicalGenres maps a collapsed lookup key (see genreKey) to the canonical +// display form for genres that must not be plain title-cased — either because +// they merge separator variants ("hip-hop"/"hiphop" -> "Hip Hop") or because +// their conventional casing is not title case (R&B, EDM, DJ, ...). +// +// The key is the lowercased, alphanumeric-only form of the input, so every +// punctuation/spacing variant of a genre maps through the same entry: +// "R&B", "r & b", "rnb" all collapse to key "rb"/"rnb" -> "R&B". +var canonicalGenres = map[string]string{ + "hiphop": "Hip Hop", + "rb": "R&B", // "r&b", "r & b" + "rnb": "R&B", // "rnb" + "randb": "R&B", // "r and b" + "edm": "EDM", + "dj": "DJ", + "dnb": "Drum & Bass", + "drumandbass": "Drum & Bass", + "drumbass": "Drum & Bass", + "lofi": "Lo-Fi", + "kpop": "K-Pop", + "jpop": "J-Pop", +} + +// genreKey reduces a genre string to a comparison key: lowercased and stripped +// of every non-alphanumeric rune. This is what makes "Hip Hop", "hip-hop", and +// "hiphop" indistinguishable, so variants collapse to a single canonical form. +func genreKey(s string) string { + var b strings.Builder + for _, r := range strings.ToLower(s) { + if unicode.IsLetter(r) || unicode.IsDigit(r) { + b.WriteRune(r) + } + } + return b.String() +} + +// NormalizeGenre collapses genre variants to a single canonical form: +// - trims surrounding whitespace and collapses internal whitespace runs +// - maps known special cases via canonicalGenres (R&B, EDM, Hip Hop, ...) +// - otherwise title-cases the value, preserving internal separators +// ("hip-hop/rap" -> "Hip-Hop/Rap") +// +// Already-canonical values pass through unchanged (e.g. "Electronic", +// "R&B", "Hip Hop"). An empty/whitespace-only input returns "". +func NormalizeGenre(s string) string { + trimmed := strings.TrimSpace(s) + if trimmed == "" { + return "" + } + if canon, ok := canonicalGenres[genreKey(trimmed)]; ok { + return canon + } + return titleCaseGenre(trimmed) +} + +// titleCaseGenre upper-cases the first letter of each alphabetic run and +// lower-cases the rest, leaving non-letter separators in place. Internal +// whitespace runs are collapsed to a single space. So "ELECTRONIC" -> +// "Electronic", "deep house" -> "Deep House", "hip-hop/rap" -> "Hip-Hop/Rap". +func titleCaseGenre(s string) string { + s = strings.Join(strings.Fields(s), " ") + var b strings.Builder + prevLetter := false + for _, r := range s { + if unicode.IsLetter(r) { + if prevLetter { + b.WriteRune(unicode.ToLower(r)) + } else { + b.WriteRune(unicode.ToUpper(r)) + } + prevLetter = true + } else { + b.WriteRune(r) + prevLetter = false + } + } + return b.String() +} diff --git a/api/genre_normalize_test.go b/api/genre_normalize_test.go new file mode 100644 index 00000000..50f3ecfe --- /dev/null +++ b/api/genre_normalize_test.go @@ -0,0 +1,57 @@ +package api + +import ( + "testing" + + "github.com/stretchr/testify/assert" +) + +func TestGenreNormalize(t *testing.T) { + cases := []struct { + name string + in string + want string + }{ + // trimming + {"trims surrounding whitespace", " Electronic ", "Electronic"}, + {"collapses internal whitespace", "Hip Hop", "Hip Hop"}, + {"empty stays empty", "", ""}, + {"whitespace-only stays empty", " ", ""}, + + // casing + {"lowercases to title case", "electronic", "Electronic"}, + {"uppercases to title case", "ELECTRONIC", "Electronic"}, + {"mixed case to title case", "eLeCtRoNiC", "Electronic"}, + {"multi-word title case", "deep house", "Deep House"}, + + // hip-hop / hiphop variants collapse to "Hip Hop" + {"hyphenated hip-hop", "hip-hop", "Hip Hop"}, + {"squashed hiphop", "hiphop", "Hip Hop"}, + {"spaced hip hop lowercase", "hip hop", "Hip Hop"}, + {"uppercase HIP-HOP", "HIP-HOP", "Hip Hop"}, + + // r&b variants collapse to "R&B" + {"r&b lowercase", "r&b", "R&B"}, + {"rnb", "rnb", "R&B"}, + {"r & b spaced", "R & B", "R&B"}, + + // other special cases keep conventional casing + {"edm", "edm", "EDM"}, + {"dj", "DJ", "DJ"}, + {"drum and bass", "drum and bass", "Drum & Bass"}, + {"dnb", "dnb", "Drum & Bass"}, + + // already-correct values pass through unchanged + {"Electronic unchanged", "Electronic", "Electronic"}, + {"R&B unchanged", "R&B", "R&B"}, + {"Hip Hop unchanged", "Hip Hop", "Hip Hop"}, + {"EDM unchanged", "EDM", "EDM"}, + {"hyphen/slash genre preserved", "Hip-Hop/Rap", "Hip-Hop/Rap"}, + } + + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + assert.Equal(t, tc.want, NormalizeGenre(tc.in), "NormalizeGenre(%q)", tc.in) + }) + } +} diff --git a/api/v1_genres_popular.go b/api/v1_genres_popular.go index 8acbd7f0..6045a224 100644 --- a/api/v1_genres_popular.go +++ b/api/v1_genres_popular.go @@ -1,6 +1,7 @@ package api import ( + "sort" "time" "api.audius.co/api/dbv1" @@ -36,19 +37,47 @@ func (app *ApiServer) v1GenresPopular(c *fiber.Ctx) error { return err } + // Genre values are written upstream (by the discovery provider) and are not + // normalized at rest, so the same logical genre can appear under several + // spellings ("Hip Hop", "hip-hop", "hiphop"). Collapse those variants to a + // canonical name here and sum their counts. NOTE: the SQL groups + paginates + // on the raw genre, so this only merges variants that land within the same + // page; the durable fix is to normalize genre on write upstream. + indexByName := make(map[string]int, len(genres)) result := make([]PopularGenre, 0, len(genres)) for _, genre := range genres { - if genre.Count < int64(params.MinCount) { + name := NormalizeGenre(genre.Genre.String) + if name == "" { continue } + if i, ok := indexByName[name]; ok { + result[i].Count += genre.Count + continue + } + + indexByName[name] = len(result) result = append(result, PopularGenre{ - Name: genre.Genre.String, + Name: name, Count: genre.Count, }) } + // Re-sort because merging variants can change the relative ordering, then + // drop anything below min_count (evaluated against the merged total). + sort.SliceStable(result, func(i, j int) bool { + return result[i].Count > result[j].Count + }) + + filtered := result[:0] + for _, g := range result { + if g.Count < int64(params.MinCount) { + continue + } + filtered = append(filtered, g) + } + return c.JSON(fiber.Map{ - "data": result, + "data": filtered, }) } diff --git a/api/v1_tracks_latest.go b/api/v1_tracks_latest.go index 95b5d37a..c65b96af 100644 --- a/api/v1_tracks_latest.go +++ b/api/v1_tracks_latest.go @@ -20,7 +20,7 @@ func (app *ApiServer) v1TracksLatest(c *fiber.Ctx) error { myId := app.getMyId(c) - trackIds, err := app.getLatestTrackIds(c, params.Genre, params.Limit, params.Offset) + trackIds, err := app.getLatestTrackIds(c, NormalizeGenre(params.Genre), params.Limit, params.Offset) if err != nil { return err } diff --git a/api/v1_tracks_trending.go b/api/v1_tracks_trending.go index 322137c4..7bc5cacf 100644 --- a/api/v1_tracks_trending.go +++ b/api/v1_tracks_trending.go @@ -23,7 +23,7 @@ func (app *ApiServer) v1TracksTrending(c *fiber.Ctx) error { trackIds, err := app.getTrendingIds( c, params.Time, - params.Genre, + NormalizeGenre(params.Genre), params.Limit, params.Offset, ) diff --git a/api/v1_tracks_trending_underground.go b/api/v1_tracks_trending_underground.go index 575e13b7..6f7e9b8d 100644 --- a/api/v1_tracks_trending_underground.go +++ b/api/v1_tracks_trending_underground.go @@ -65,7 +65,7 @@ func (app *ApiServer) v1TracksTrendingUnderground(c *fiber.Ctx) error { args["limit"] = params.Limit args["offset"] = params.Offset args["time"] = params.Time - args["genre"] = params.Genre + args["genre"] = NormalizeGenre(params.Genre) rows, err := app.pool.Query(c.Context(), sql, args) if err != nil { diff --git a/api/v1_users_genre_top.go b/api/v1_users_genre_top.go index 7bbc0c47..0f496808 100644 --- a/api/v1_users_genre_top.go +++ b/api/v1_users_genre_top.go @@ -28,7 +28,12 @@ func (app *ApiServer) v1UsersGenreTop(c *fiber.Ctx) error { OFFSET @offset ;` + genres := make([]string, len(query.Genres)) + for i, g := range query.Genres { + genres[i] = NormalizeGenre(g) + } + return app.queryUsers(c, sql, pgx.NamedArgs{ - "genres": query.Genres, + "genres": genres, }) } From 01ba2ba3c0d5f2fe761843e9063272a6c06d15ec Mon Sep 17 00:00:00 2001 From: Dylan Jeffers Date: Wed, 17 Jun 2026 17:47:31 -0700 Subject: [PATCH 2/3] fix: drop unsound genre filter-param normalization MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Normalizing the `genre` query param broke filtering: the tracks.genre column is written upstream (discovery provider) and is NOT normalized at rest, so canonicalizing the param no longer matches the raw stored value. CI surfaced this via TestGetLatestWithGenre — a query for "LatestTestGenreA" was title-cased to "Latesttestgenrea" and matched zero rows. Revert NormalizeGenre on the trending, underground, latest, and users/genre/top filter params. Keep normalization only on the /v1/genres/popular response, which is a display-layer transform that does not depend on matching stored values. Fully collapsing variants for filtering requires normalizing genre on write upstream. Co-Authored-By: Claude Opus 4.8 (1M context) --- api/v1_tracks_latest.go | 2 +- api/v1_tracks_trending.go | 2 +- api/v1_tracks_trending_underground.go | 2 +- api/v1_users_genre_top.go | 7 +------ 4 files changed, 4 insertions(+), 9 deletions(-) diff --git a/api/v1_tracks_latest.go b/api/v1_tracks_latest.go index c65b96af..95b5d37a 100644 --- a/api/v1_tracks_latest.go +++ b/api/v1_tracks_latest.go @@ -20,7 +20,7 @@ func (app *ApiServer) v1TracksLatest(c *fiber.Ctx) error { myId := app.getMyId(c) - trackIds, err := app.getLatestTrackIds(c, NormalizeGenre(params.Genre), params.Limit, params.Offset) + trackIds, err := app.getLatestTrackIds(c, params.Genre, params.Limit, params.Offset) if err != nil { return err } diff --git a/api/v1_tracks_trending.go b/api/v1_tracks_trending.go index 7bc5cacf..322137c4 100644 --- a/api/v1_tracks_trending.go +++ b/api/v1_tracks_trending.go @@ -23,7 +23,7 @@ func (app *ApiServer) v1TracksTrending(c *fiber.Ctx) error { trackIds, err := app.getTrendingIds( c, params.Time, - NormalizeGenre(params.Genre), + params.Genre, params.Limit, params.Offset, ) diff --git a/api/v1_tracks_trending_underground.go b/api/v1_tracks_trending_underground.go index 6f7e9b8d..575e13b7 100644 --- a/api/v1_tracks_trending_underground.go +++ b/api/v1_tracks_trending_underground.go @@ -65,7 +65,7 @@ func (app *ApiServer) v1TracksTrendingUnderground(c *fiber.Ctx) error { args["limit"] = params.Limit args["offset"] = params.Offset args["time"] = params.Time - args["genre"] = NormalizeGenre(params.Genre) + args["genre"] = params.Genre rows, err := app.pool.Query(c.Context(), sql, args) if err != nil { diff --git a/api/v1_users_genre_top.go b/api/v1_users_genre_top.go index 0f496808..7bbc0c47 100644 --- a/api/v1_users_genre_top.go +++ b/api/v1_users_genre_top.go @@ -28,12 +28,7 @@ func (app *ApiServer) v1UsersGenreTop(c *fiber.Ctx) error { OFFSET @offset ;` - genres := make([]string, len(query.Genres)) - for i, g := range query.Genres { - genres[i] = NormalizeGenre(g) - } - return app.queryUsers(c, sql, pgx.NamedArgs{ - "genres": genres, + "genres": query.Genres, }) } From 76a81be11518efea7df76db7a812e1b9dc0992f6 Mon Sep 17 00:00:00 2001 From: Dylan Jeffers Date: Wed, 17 Jun 2026 18:03:51 -0700 Subject: [PATCH 3/3] refactor: align canonical genre forms with go-openaudio GenreAllowlist NormalizeGenre now emits the protocol's canonical genre spellings as defined by go-openaudio's GenreAllowlist (the upstream ETL genre write path), so the API's normalized output matches what the indexer treats as canonical: - "Hip Hop" -> "Hip-Hop/Rap" - "R&B" -> "R&B/Soul" - Drum & Bass, Lo-Fi unchanged (already allowlist forms) Drop the speculative K-Pop/J-Pop entries that are not in the allowlist; they now fall through to generic title-casing. EDM/DJ are kept purely as acronym casing fixes (not allowlist genres). Test assertions updated to the new canonical forms. Co-Authored-By: Claude Opus 4.8 (1M context) --- api/genre_normalize.go | 45 ++++++++++++++++++++++++------------- api/genre_normalize_test.go | 29 +++++++++++++----------- 2 files changed, 45 insertions(+), 29 deletions(-) diff --git a/api/genre_normalize.go b/api/genre_normalize.go index 4d47ec4a..885f8cf4 100644 --- a/api/genre_normalize.go +++ b/api/genre_normalize.go @@ -7,25 +7,36 @@ import ( // canonicalGenres maps a collapsed lookup key (see genreKey) to the canonical // display form for genres that must not be plain title-cased — either because -// they merge separator variants ("hip-hop"/"hiphop" -> "Hip Hop") or because -// their conventional casing is not title case (R&B, EDM, DJ, ...). +// they merge separator variants ("hip-hop"/"hiphop" -> "Hip-Hop/Rap") or +// because their conventional casing is not title case (R&B/Soul, EDM, ...). +// +// The canonical output forms mirror go-openaudio's GenreAllowlist +// (pkg/etl/processors/entity_manager/genre_allowlist.go), which is the +// protocol's source of truth for canonical genre spelling. Keeping these in +// sync means the API's normalized output agrees with the form the upstream ETL +// indexer (the genre write path) treats as canonical. // // The key is the lowercased, alphanumeric-only form of the input, so every // punctuation/spacing variant of a genre maps through the same entry: -// "R&B", "r & b", "rnb" all collapse to key "rb"/"rnb" -> "R&B". +// "R&B", "r & b", "rnb", "R&B/Soul" all collapse to "R&B/Soul". var canonicalGenres = map[string]string{ - "hiphop": "Hip Hop", - "rb": "R&B", // "r&b", "r & b" - "rnb": "R&B", // "rnb" - "randb": "R&B", // "r and b" - "edm": "EDM", - "dj": "DJ", + // Allowlist genres whose canonical spelling differs from naive title case. + "hiphop": "Hip-Hop/Rap", // "Hip Hop", "hip-hop", "hiphop" + "hiphoprap": "Hip-Hop/Rap", // "Hip-Hop/Rap", "hip hop rap" + "rb": "R&B/Soul", // "r&b", "r & b" + "rnb": "R&B/Soul", // "rnb" + "randb": "R&B/Soul", // "r and b" + "rbsoul": "R&B/Soul", // "R&B/Soul" + "rnbsoul": "R&B/Soul", // "rnb/soul" "dnb": "Drum & Bass", "drumandbass": "Drum & Bass", - "drumbass": "Drum & Bass", + "drumbass": "Drum & Bass", // "Drum & Bass" itself "lofi": "Lo-Fi", - "kpop": "K-Pop", - "jpop": "J-Pop", + + // Acronyms not in the allowlist, kept only to preserve casing (so they are + // not title-cased to "Edm"/"Dj"). + "edm": "EDM", + "dj": "DJ", } // genreKey reduces a genre string to a comparison key: lowercased and stripped @@ -43,12 +54,14 @@ func genreKey(s string) string { // NormalizeGenre collapses genre variants to a single canonical form: // - trims surrounding whitespace and collapses internal whitespace runs -// - maps known special cases via canonicalGenres (R&B, EDM, Hip Hop, ...) +// - maps known special cases via canonicalGenres (R&B/Soul, EDM, +// Hip-Hop/Rap, ...) // - otherwise title-cases the value, preserving internal separators -// ("hip-hop/rap" -> "Hip-Hop/Rap") +// ("deep house" -> "Deep House") // -// Already-canonical values pass through unchanged (e.g. "Electronic", -// "R&B", "Hip Hop"). An empty/whitespace-only input returns "". +// Already-canonical allowlist values pass through unchanged (e.g. +// "Electronic", "R&B/Soul", "Hip-Hop/Rap"). An empty/whitespace-only input +// returns "". func NormalizeGenre(s string) string { trimmed := strings.TrimSpace(s) if trimmed == "" { diff --git a/api/genre_normalize_test.go b/api/genre_normalize_test.go index 50f3ecfe..29fef6e3 100644 --- a/api/genre_normalize_test.go +++ b/api/genre_normalize_test.go @@ -14,7 +14,7 @@ func TestGenreNormalize(t *testing.T) { }{ // trimming {"trims surrounding whitespace", " Electronic ", "Electronic"}, - {"collapses internal whitespace", "Hip Hop", "Hip Hop"}, + {"collapses internal whitespace", "Deep House", "Deep House"}, {"empty stays empty", "", ""}, {"whitespace-only stays empty", " ", ""}, @@ -25,15 +25,16 @@ func TestGenreNormalize(t *testing.T) { {"multi-word title case", "deep house", "Deep House"}, // hip-hop / hiphop variants collapse to "Hip Hop" - {"hyphenated hip-hop", "hip-hop", "Hip Hop"}, - {"squashed hiphop", "hiphop", "Hip Hop"}, - {"spaced hip hop lowercase", "hip hop", "Hip Hop"}, - {"uppercase HIP-HOP", "HIP-HOP", "Hip Hop"}, + // hip-hop / hiphop variants collapse to the allowlist form "Hip-Hop/Rap" + {"hyphenated hip-hop", "hip-hop", "Hip-Hop/Rap"}, + {"squashed hiphop", "hiphop", "Hip-Hop/Rap"}, + {"spaced hip hop lowercase", "hip hop", "Hip-Hop/Rap"}, + {"uppercase HIP-HOP", "HIP-HOP", "Hip-Hop/Rap"}, - // r&b variants collapse to "R&B" - {"r&b lowercase", "r&b", "R&B"}, - {"rnb", "rnb", "R&B"}, - {"r & b spaced", "R & B", "R&B"}, + // r&b variants collapse to the allowlist form "R&B/Soul" + {"r&b lowercase", "r&b", "R&B/Soul"}, + {"rnb", "rnb", "R&B/Soul"}, + {"r & b spaced", "R & B", "R&B/Soul"}, // other special cases keep conventional casing {"edm", "edm", "EDM"}, @@ -41,12 +42,14 @@ func TestGenreNormalize(t *testing.T) { {"drum and bass", "drum and bass", "Drum & Bass"}, {"dnb", "dnb", "Drum & Bass"}, - // already-correct values pass through unchanged + // already-canonical allowlist values pass through unchanged {"Electronic unchanged", "Electronic", "Electronic"}, - {"R&B unchanged", "R&B", "R&B"}, - {"Hip Hop unchanged", "Hip Hop", "Hip Hop"}, + {"Hip-Hop/Rap unchanged", "Hip-Hop/Rap", "Hip-Hop/Rap"}, + {"R&B/Soul unchanged", "R&B/Soul", "R&B/Soul"}, + {"Drum & Bass unchanged", "Drum & Bass", "Drum & Bass"}, + {"Lo-Fi unchanged", "Lo-Fi", "Lo-Fi"}, + {"Deep House unchanged", "Deep House", "Deep House"}, {"EDM unchanged", "EDM", "EDM"}, - {"hyphen/slash genre preserved", "Hip-Hop/Rap", "Hip-Hop/Rap"}, } for _, tc := range cases {