I'm doing some work where I need "natural sort" - eg, "Foo 2" comes before "Foo 10". In application code (Elixir), this is not hard: use a regex to split a string into numeric and text chunks, and create an array/list of strings and numbers. It sorts as expected.
@doc """
Generates a natural sort key for a string so that (for example) strings sort
as "foo 1", "foo 2", "foo 10", instead of "foo 1", "foo 10", "foo 2".
It does this by splitting the string into a list of lowercase strings and
numbers.
Note that numbers sort before strings: in Elixir/Erlang: `number < atom <
reference < function < port < pid < tuple < map < list < bitstring`
Examples:
iex> natural_sort_key("foo 1")
["foo ", 1]
iex> natural_sort_key("FOO 10")
["foo ", 10]
iex> natural_sort_key("foo 1.3")
["foo ", 1.3]
iex> natural_sort_key("🌟 Café 1.3")
["🌟 café ", 1.3]
iex> natural_sort_key("2 foo 1.3 bar 2")
[2, " foo ", 1.3, " bar ", 2]
Sorting examples:
iex> Enum.sort_by(
...> ["1.1B", "1.2A", "1A", "10A", "1B", "1.1A"],
...> &natural_sort_key/1
...> )
["1A", "1B", "1.1A", "1.1B", "1.2A", "10A"]
iex> Enum.sort_by(
...> [%{name: "foo1"}, %{name: "foo10"}, %{name: "foo2"}],
...> fn foo -> natural_sort_key(foo.name) end
...> )
[%{name: "foo1"}, %{name: "foo2"}, %{name: "foo10"}]
iex> Enum.sort_by(
...> ["1", "A", "2", "B"],
...> fn val -> natural_sort_key(val) end
...> )
["1", "2", "A", "B"]
"""
@spec natural_sort_key(String.t()) :: [String.t() | integer() | float()]
def natural_sort_key(string) when is_binary(string) do
# Use named captures to identify floats, integers, and text parts
~r/(?<float>\d+\.\d+)|(?<integer>\d+)|(?<text>[^\d]+)/
|> Regex.scan(string, capture: :all_names)
|> Enum.map(fn
["", integer, ""] when integer != "" -> String.to_integer(integer)
[float, "", ""] when float != "" -> String.to_float(float)
["", "", text] when text != "" -> String.downcase(text)
end)
end
Getting PostgreSQL to sort the same way proved difficult. I tried "numeric collation" as shown in this answer, but it didn't handle all the cases I expected.
Finally I (with the help of AI, a test suite, and a lot of patience) came up with this complex, expensive, but working function, which allows doing ORDER BY natural_sort_key(any_textual_column).
-- Allow doing `ORDER BY natural_sort_key(any_textual_column)`
-- such that it sorts numbers naturally (e.g., "HVAC2" before "HVAC10").
-- Works on text, citext, varchar, char columns.
CREATE FUNCTION "natural_sort_key"(input_text text) RETURNS jsonb AS $$
BEGIN
RETURN (
WITH components AS (
SELECT jsonb_agg(
CASE
-- Split the text into chunks of contiguous numbers and text.
-- We need jsonb arrays, not normal arrays, to support a mix of numbers and text.
--
-- In each case, we wrap the value in an array and add a leading 0 or 1.
-- This is because JSONB natively sorts numbers after strings, and we need
-- to overrule that to be consistent with how sorting works elsewhere.
--
-- We also need to pad the json array because shorter arrays are
-- sorted before longer ones.
WHEN m[1] ~ '^\d+\.\d+$' THEN jsonb_build_array(0, m[1]::numeric)
WHEN m[1] ~ '^\d+$' THEN jsonb_build_array(0, m[1]::numeric)
ELSE jsonb_build_array(1, lower(m[1]))
END
) as components_array
FROM regexp_matches(input_text, '\d+\.\d+|\d+|[a-zA-Z]+', 'g') m
)
SELECT components_array || (
-- To ensure that the number of "chunks" doesn't affect the sorting, ensure
-- that we always have 10 of them.
-- (This would break if the input had more than 10 chunks, like
-- 1A3A5A7D9E11, but that's very unlikely.)
-- Pad with [2, ''] so that these "blank" chunks sort after all actual numbers or text.
SELECT jsonb_agg(jsonb_build_array(2, ''))
FROM generate_series(1, GREATEST(0, 10 - jsonb_array_length(components_array)))
)
FROM components
);
END;
$$ LANGUAGE plpgsql IMMUTABLE;
Here are the cases where I confirmed that it does the right thing:
-- Test Suite: All these cases must sort in the exact order shown
-- Test 1: Purely numeric values
WITH test_purely_numeric AS (
SELECT unnest(ARRAY['100', '10', '2', '1']) as name
)
SELECT 'Test 1: Purely Numeric' as test_name,
string_agg(name, ', ' ORDER BY natural_sort_key(name)) as sorted_result,
'1, 2, 10, 100' as expected
FROM test_purely_numeric;
-- Test 2: Purely alphabetic values
WITH test_purely_alpha AS (
SELECT unnest(ARRAY['Gamma', 'Alpha', 'Beta']) as name
)
SELECT 'Test 2: Purely Alphabetic' as test_name,
string_agg(name, ', ' ORDER BY natural_sort_key(name)) as sorted_result,
'Alpha, Beta, Gamma' as expected
FROM test_purely_alpha;
-- Test 3: Mixed alphanumeric values
WITH test_mixed_alpha AS (
SELECT unnest(ARRAY['HVAC B1', 'HVAC A10', 'HVAC 1', 'HVAC A1', 'HVAC 2', 'HVAC A2']) as name
)
SELECT 'Test 3a: Mixed Alphanumeric' as test_name,
string_agg(name, ', ' ORDER BY natural_sort_key(name)) as sorted_result,
'HVAC 1, HVAC 2, HVAC A1, HVAC A2, HVAC A10, HVAC B1' as expected
FROM test_mixed_alpha;
WITH test_mixed_alpha2 AS (
SELECT unnest(ARRAY['5', '4', '1.1A', '1A']) as name
)
SELECT 'Test 3b: Mixed Alphanumeric' as test_name,
string_agg(name, ', ' ORDER BY natural_sort_key(name)) as sorted_result,
'1A, 1.1A, 4, 5' as expected
FROM test_mixed_alpha2;
-- Test 4: Numbers before letters
WITH test_numbers_first AS (
SELECT unnest(ARRAY['B', 'A', '2', '1']) as name
)
SELECT 'Test 4: Numbers Before Letters' as test_name,
string_agg(name, ', ' ORDER BY natural_sort_key(name)) as sorted_result,
'1, 2, A, B' as expected
FROM test_numbers_first;
-- Test 5: Multiple numbers in string
WITH test_multiple_numbers AS (
SELECT unnest(ARRAY['Room 10 Unit 1', 'Room 2 Unit 1', 'Room 1 Unit 10', 'Room 1 Unit 2']) as name
)
SELECT 'Test 5: Multiple Numbers' as test_name,
string_agg(name, ', ' ORDER BY natural_sort_key(name)) as sorted_result,
'Room 1 Unit 2, Room 1 Unit 10, Room 2 Unit 1, Room 10 Unit 1' as expected
FROM test_multiple_numbers;
-- Test 6: Leading zeros
WITH test_leading_zeros AS (
SELECT unnest(ARRAY['Unit 10', 'Unit 02', 'Unit 01']) as name
)
SELECT 'Test 6: Leading Zeros' as test_name,
string_agg(name, ', ' ORDER BY natural_sort_key(name)) as sorted_result,
'Unit 01, Unit 02, Unit 10' as expected
FROM test_leading_zeros;
-- Test 7: Case insensitive
WITH test_case_insensitive AS (
SELECT unnest(ARRAY['unit 11', 'UNIT 10', 'Unit 3', 'unit 2', 'Unit 1']) as name
)
SELECT 'Test 7: Case Insensitive' as test_name,
string_agg(name, ', ' ORDER BY natural_sort_key(name)) as sorted_result,
'Unit 1, unit 2, Unit 3, UNIT 10, unit 11' as expected
FROM test_case_insensitive;
-- Test 8: Decimal handling (the complex one!)
WITH test_decimals AS (
SELECT unnest(ARRAY[
'B1.1', 'B1', 'A10', 'A1.2', 'A1.1', 'A1', '10A', '1.2A',
'1.1B', '1.1A', '1.01A', '1B', '1A'
]) as name
)
SELECT 'Test 8: Decimals' as test_name,
string_agg(name, ', ' ORDER BY natural_sort_key(name)) as sorted_result,
'1A, 1B, 1.01A, 1.1A, 1.1B, 1.2A, 10A, A1, A1.1, A1.2, A10, B1, B1.1' as expected
FROM test_decimals;
I made a fiddle where you can play with the function and the test cases here: https://www.db-fiddle.com/f/pEYUN1z2ickWZAq1iDM3Y/6
There has to be a simpler, cheaper way, right?
You can use COLLATION that supports natural sort:
CREATE COLLATION natural_sort (provider = icu, locale = 'en@colNumeric=yes');
Query 1:
WITH test_mixed_alpha AS (
SELECT unnest(ARRAY['HVAC B1','HVAC A10','HVAC 1','HVAC A1','HVAC 2','HVAC A2']) name
)
SELECT array_agg(name ORDER BY name COLLATE natural_sort) AS sorted,
'HVAC 1, HVAC 2, HVAC A1, HVAC A2, HVAC A10, HVAC B1' AS expected
FROM test_mixed_alpha;
Query 2:
WITH test_mixed_alpha2 AS (
SELECT unnest(ARRAY['5', '4', '1.1A', '1A']) as name
)
SELECT array_agg(name ORDER BY name COLLATE natural_sort) AS sorted,
'1A, 1.1A, 4, 5' as expected
FROM test_mixed_alpha2;
db<>fiddle demo
If you love us? You can donate to us via Paypal or buy me a coffee so we can maintain and grow! Thank you!
Donate Us With