Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions api/genre_normalize.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
package api

import (
"strings"
"unicode"
)

// canonicalGenres maps a collapsed lookup key (see genreKey) to the canonical
// display form for genres that must not be plain title-cased, either because
// several spellings have to merge into one name ("hip-hop"/"hiphop" ->
// "Hip-Hop/Rap") or because the conventional casing is not title case
// (R&B/Soul, EDM, ...).
//
// The canonical output forms mirror go-openaudio's GenreAllowlist
// (pkg/etl/processors/entity_manager/genre_allowlist.go), which is the
// protocol's source of truth for canonical genre spelling. Keeping these in
// sync means the API's normalized output agrees with the form the upstream ETL
// indexer (the genre write path) treats as canonical.
//
// The key is the lowercased, alphanumeric-only form of the input, so pure
// punctuation/spacing/casing variants share one key ("R&B", "r & b" and "R.B."
// are all "rb"). Variants that differ in their letters ("&" vs "and" vs the
// "n" contraction, or a trailing "/Soul") produce different keys and therefore
// each need their own entry pointing at the same canonical value.
var canonicalGenres = map[string]string{
	// Hip-Hop/Rap.
	"hiphop":       "Hip-Hop/Rap", // "Hip Hop", "hip-hop", "hiphop"
	"hiphoprap":    "Hip-Hop/Rap", // "Hip-Hop/Rap", "hip hop rap"
	"hiphopandrap": "Hip-Hop/Rap", // "hip hop and rap", "Hip-Hop & Rap" spelled out

	// R&B/Soul.
	"rb":        "R&B/Soul", // "r&b", "r & b"
	"rnb":       "R&B/Soul", // "rnb"
	"randb":     "R&B/Soul", // "r and b"
	"rbsoul":    "R&B/Soul", // "R&B/Soul"
	"rnbsoul":   "R&B/Soul", // "rnb/soul"
	"randbsoul": "R&B/Soul", // "r and b / soul"

	// Drum & Bass.
	"dnb":         "Drum & Bass",
	"drumandbass": "Drum & Bass",
	"drumnbass":   "Drum & Bass", // "drum n bass", "drum'n'bass", "Drum 'n' Bass"
	"drumbass":    "Drum & Bass", // "Drum & Bass" itself

	// Lo-Fi.
	"lofi": "Lo-Fi",

	// Acronyms not in the allowlist, kept only to preserve casing (so they are
	// not title-cased to "Edm"/"Dj").
	"edm": "EDM",
	"dj":  "DJ",
}

// genreKey builds the comparison key used to look a genre up in
// canonicalGenres. Each rune is lowercased, and anything that is then neither
// a letter nor a digit (spaces, hyphens, slashes, ampersands, ...) is dropped,
// so "Hip Hop", "hip-hop", and "hiphop" all yield "hiphop".
func genreKey(s string) string {
	return strings.Map(func(r rune) rune {
		lowered := unicode.ToLower(r)
		if !unicode.IsLetter(lowered) && !unicode.IsDigit(lowered) {
			// A negative result tells strings.Map to drop the rune.
			return -1
		}
		return lowered
	}, s)
}

// NormalizeGenre returns the canonical display form of a genre string.
//
// Steps, in order:
//   - surrounding whitespace is trimmed; an empty or whitespace-only input
//     yields "".
//   - the trimmed value is reduced with genreKey and looked up in
//     canonicalGenres; a hit returns the mapped special-case form (R&B/Soul,
//     EDM, Hip-Hop/Rap, ...).
//   - anything else goes through titleCaseGenre, which collapses internal
//     whitespace runs and title-cases the value while keeping its separators
//     ("deep house" -> "Deep House").
//
// Values that are already canonical come back unchanged (e.g. "Electronic",
// "R&B/Soul", "Hip-Hop/Rap").
func NormalizeGenre(s string) string {
	name := strings.TrimSpace(s)
	if name == "" {
		return ""
	}

	canonical, known := canonicalGenres[genreKey(name)]
	if !known {
		return titleCaseGenre(name)
	}
	return canonical
}

// titleCaseGenre upper-cases the first letter of each word and lower-cases the
// remaining letters, leaving separators in place. Internal whitespace runs are
// collapsed to a single space. So "ELECTRONIC" -> "Electronic", "deep house"
// -> "Deep House", "hip-hop/rap" -> "Hip-Hop/Rap".
//
// A word is a run of letters and digits. An apostrophe (ASCII ' or U+2019)
// never ends a word, so a letter that follows a digit or an in-word apostrophe
// is lower-cased rather than treated as the start of a new word:
// "80s" -> "80s" (not "80S"), "children's" -> "Children's" (not
// "Children'S"). An apostrophe outside a word does not start one either, so
// "rock 'n' roll" -> "Rock 'N' Roll". Every other non-letter rune (space,
// hyphen, slash, ampersand, ...) is copied through and starts a new word.
func titleCaseGenre(s string) string {
	s = strings.Join(strings.Fields(s), " ")

	var b strings.Builder
	b.Grow(len(s))

	inWord := false
	for _, r := range s {
		switch {
		case unicode.IsLetter(r):
			if inWord {
				r = unicode.ToLower(r)
			} else {
				r = unicode.ToUpper(r)
			}
			inWord = true
		case unicode.IsDigit(r):
			// Digits belong to the word they appear in ("80s", "2step"), so
			// the letter after them must not be capitalised.
			inWord = true
		case r == '\'' || r == '\u2019':
			// Apostrophes keep the current state: inside a word they join the
			// possessive/contraction suffix to it; outside a word they are
			// plain punctuation.
		default:
			inWord = false
		}
		b.WriteRune(r)
	}
	return b.String()
}
60 changes: 60 additions & 0 deletions api/genre_normalize_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
package api

import (
"testing"

"github.com/stretchr/testify/assert"
)

// TestGenreNormalize is a table-driven check of NormalizeGenre: each case
// feeds one raw genre string in and compares the result with the expected
// canonical display form.
func TestGenreNormalize(t *testing.T) {
	cases := []struct {
		name string
		in   string
		want string
	}{
		// trimming
		{"trims surrounding whitespace", " Electronic ", "Electronic"},
		// The input carries a run of mixed whitespace (written with an escape
		// so it cannot be silently collapsed by an editor) to actually
		// exercise the collapsing path.
		{"collapses internal whitespace", "Deep \t House", "Deep House"},
		{"empty stays empty", "", ""},
		{"whitespace-only stays empty", " ", ""},

		// casing
		{"lowercases to title case", "electronic", "Electronic"},
		{"uppercases to title case", "ELECTRONIC", "Electronic"},
		{"mixed case to title case", "eLeCtRoNiC", "Electronic"},
		{"multi-word title case", "deep house", "Deep House"},

		// hip-hop / hiphop variants collapse to the allowlist form "Hip-Hop/Rap"
		{"hyphenated hip-hop", "hip-hop", "Hip-Hop/Rap"},
		{"squashed hiphop", "hiphop", "Hip-Hop/Rap"},
		{"spaced hip hop lowercase", "hip hop", "Hip-Hop/Rap"},
		{"uppercase HIP-HOP", "HIP-HOP", "Hip-Hop/Rap"},

		// r&b variants collapse to the allowlist form "R&B/Soul"
		{"r&b lowercase", "r&b", "R&B/Soul"},
		{"rnb", "rnb", "R&B/Soul"},
		{"r & b spaced", "R & B", "R&B/Soul"},

		// other special cases keep conventional casing
		{"edm", "edm", "EDM"},
		{"dj", "DJ", "DJ"},
		{"drum and bass", "drum and bass", "Drum & Bass"},
		{"dnb", "dnb", "Drum & Bass"},

		// already-canonical allowlist values pass through unchanged
		{"Electronic unchanged", "Electronic", "Electronic"},
		{"Hip-Hop/Rap unchanged", "Hip-Hop/Rap", "Hip-Hop/Rap"},
		{"R&B/Soul unchanged", "R&B/Soul", "R&B/Soul"},
		{"Drum & Bass unchanged", "Drum & Bass", "Drum & Bass"},
		{"Lo-Fi unchanged", "Lo-Fi", "Lo-Fi"},
		{"Deep House unchanged", "Deep House", "Deep House"},
		{"EDM unchanged", "EDM", "EDM"},
	}

	for _, tc := range cases {
		t.Run(tc.name, func(t *testing.T) {
			assert.Equal(t, tc.want, NormalizeGenre(tc.in), "NormalizeGenre(%q)", tc.in)
		})
	}
}
35 changes: 32 additions & 3 deletions api/v1_genres_popular.go
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
package api

import (
"sort"
"time"

"api.audius.co/api/dbv1"
Expand Down Expand Up @@ -36,19 +37,47 @@ func (app *ApiServer) v1GenresPopular(c *fiber.Ctx) error {
return err
}

// Genre values are written upstream (by the discovery provider) and are not
// normalized at rest, so the same logical genre can appear under several
// spellings ("Hip Hop", "hip-hop", "hiphop"). Collapse those variants to a
// canonical name here and sum their counts. NOTE: the SQL groups + paginates
// on the raw genre, so this only merges variants that land within the same
// page; the durable fix is to normalize genre on write upstream.
indexByName := make(map[string]int, len(genres))
result := make([]PopularGenre, 0, len(genres))
for _, genre := range genres {
if genre.Count < int64(params.MinCount) {
name := NormalizeGenre(genre.Genre.String)
if name == "" {
continue
}

if i, ok := indexByName[name]; ok {
result[i].Count += genre.Count
continue
}

indexByName[name] = len(result)
result = append(result, PopularGenre{
Name: genre.Genre.String,
Name: name,
Count: genre.Count,
})
}

// Re-sort because merging variants can change the relative ordering, then
// drop anything below min_count (evaluated against the merged total).
sort.SliceStable(result, func(i, j int) bool {
return result[i].Count > result[j].Count
})

filtered := result[:0]
for _, g := range result {
if g.Count < int64(params.MinCount) {
continue
}
filtered = append(filtered, g)
}

return c.JSON(fiber.Map{
"data": result,
"data": filtered,
})
}
Loading