Skip to content
Merged
4 changes: 3 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@

**Text-to-text alignment algorithm for speech recognition error analysis.** ErrorAlign helps you dig deeper into your speech recognition projects by accurately aligning each word in a reference transcript with the model-generated transcript. Unlike traditional methods, such as Levenshtein-based alignment, it is not restricted to simple one-to-one alignment, but can map a single reference word to multiple words or subwords in the model output. This enables quick and reliable identification of error patterns in rare words, names, or domain-specific terms that matter most for your application.

→ **Update [2025-12-10]:** As of version `0.1.0b5`, `error-align` will include a word-level pass to efficiently identify unambiguous matches, along with C++ extensions to accelerate beam search and backtrace construction. The combined speedup is ~15× over the pure-Python implementation ⚡
→ **Update [2026-06-22]:** As of version `0.1.0b10`, the word-level pass defaults to a faster `rapidfuzz`-based method that anchors matches from a single optimal Levenshtein alignment. On longer examples (e.g., Earnings-21), the speedup is expected to be around 30×. The original graph-based pass is still available via `error_align(ref, hyp, word_level_method="unambiguous")`.

→ **Update [2025-12-10]:** As of version `0.1.0b5`, `error-align` includes a word-level pass to efficiently identify unambiguous matches, along with C++ extensions to accelerate beam search and backtrace construction. The combined speedup is ~15× over the pure-Python implementation.

[//]: <> (https://raw.githubusercontent.com/corticph/error-align/refs/heads/main/.github/assets/logo_gpt.svg)

Expand Down
14 changes: 10 additions & 4 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@ build-backend = "scikit_build_core.build"

[project]
name = "error-align"
version = "0.1.0b9"
version = "0.1.0b10"
description = "Text-to-text alignment algorithm for speech recognition error analysis."
readme = "README.md"
requires-python = ">=3.10,<3.15"
Expand All @@ -36,14 +36,14 @@ dependencies = [
"tqdm>=4.67.1",
"unidecode>=1.4.0",
"regex>=2025.9.18",
"rapidfuzz>=3.13.0",
]

[project.urls]
Homepage = "https://github.com/corticph/error-align"

[project.optional-dependencies]
evaluation = [
"rapidfuzz>=3.13.0; python_version == '3.12'",
"num2words>=0.5.14; python_version == '3.12'",
"datasets>=3.3.2; python_version == '3.12'",
"soundfile>=0.13.1; python_version == '3.12'",
Expand Down
52 changes: 48 additions & 4 deletions src/error_align/error_align.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from rapidfuzz.distance import Levenshtein

from error_align.backtrace_graph import BacktraceGraph
from error_align.core import compute_levenshtein_distance_matrix, error_align_beam_search
from error_align.graph_metadata import GraphMetadata, SubgraphMetadata
Expand All @@ -19,6 +21,7 @@ def error_align(
normalizer: callable = basic_normalizer,
beam_size: int = 100,
word_level_pass: bool = True,
word_level_method: str = "rapidfuzz",
):
"""Run error alignment between reference and hypothesis texts.

Expand All @@ -28,7 +31,10 @@ def error_align(
tokenizer (callable): A function to tokenize the sequences. Must be regex-based and return Match objects.
normalizer (callable): A function to normalize the tokens. Defaults to basic_normalizer.
beam_size (int): The beam size for beam search alignment.
word_level_pass (bool): Whether to perform a word-level alignment pass to identify unambiguous matches.
word_level_pass (bool): Whether to perform a word-level alignment pass to anchor matches before beam search.
word_level_method (str): Which word-level pass to use when ``word_level_pass`` is True. ``"rapidfuzz"``
(default) takes the matches from a single optimal Levenshtein alignment via rapidfuzz. ``"unambiguous"``
builds the full backtrace graph and only anchors matches common to all optimal paths.

"""
graph_metadata = prepare_graph_metadata(
Expand All @@ -42,8 +48,12 @@ def error_align(
return align_identical_inputs(graph_metadata)
elif not word_level_pass:
return align_beam_search(graph_metadata, beam_size=beam_size)
else:
elif word_level_method == "rapidfuzz":
return align_with_rapidfuzz_word_level_pass(graph_metadata, beam_size=beam_size)
elif word_level_method == "unambiguous":
return align_with_word_level_pass(graph_metadata, beam_size=beam_size)
else:
raise ValueError(f"Unknown word_level_method: {word_level_method!r}")


def prepare_graph_metadata(
Expand Down Expand Up @@ -127,10 +137,44 @@ def align_with_word_level_pass(
)
backtrace_graph = BacktraceGraph(backtrace_matrix)
match_indices = backtrace_graph.get_unambiguous_node_matches()
return align_from_match_indices(graph_metadata, beam_size, match_indices)


def align_with_rapidfuzz_word_level_pass(
    graph_metadata: GraphMetadata,
    beam_size: int,
) -> list[Alignment]:
    """Run the word-level pass anchored on a single optimal Levenshtein alignment.

    The ``ref_norm`` and ``hyp_norm`` sequences of ``graph_metadata`` are passed to
    ``get_rapidfuzz_match_indices``; the anchors it returns are forwarded, together with
    ``beam_size``, to ``align_from_match_indices``, whose result is returned unchanged.
    """
    return align_from_match_indices(
        graph_metadata,
        beam_size,
        get_rapidfuzz_match_indices(graph_metadata.ref_norm, graph_metadata.hyp_norm),
    )


def get_rapidfuzz_match_indices(ref_norm: list[str], hyp_norm: list[str]) -> list[tuple[int, int]]:
    """Derive word-level match anchors from one rapidfuzz Levenshtein alignment.

    rapidfuzz reports only the edit operations (insert/delete/replace), never the matches, so a
    matched token is any index that no edit operation touches on its side.

    Args:
        ref_norm: Reference tokens.
        hyp_norm: Hypothesis tokens.

    Returns:
        ``(hyp_idx, ref_idx)`` tuples, the same ordering convention produced by
        ``BacktraceGraph.get_unambiguous_node_matches`` and consumed by ``align_from_match_indices``.
    """
    edited_ref: set[int] = set()
    edited_hyp: set[int] = set()
    # Each edit op is a (tag, ref_idx, hyp_idx) triple.
    for tag, ref_pos, hyp_pos in Levenshtein.editops(ref_norm, hyp_norm).as_list():
        # An insert consumes no reference token, so its ref index is not an edited one.
        if tag != "insert":
            edited_ref.add(ref_pos)
        # A delete consumes no hypothesis token, so its hyp index is not an edited one.
        if tag != "delete":
            edited_hyp.add(hyp_pos)

    kept_ref = [idx for idx in range(len(ref_norm)) if idx not in edited_ref]
    kept_hyp = [idx for idx in range(len(hyp_norm)) if idx not in edited_hyp]

    # Matches are monotonic on both axes, so the k-th untouched ref token pairs with the k-th
    # untouched hyp token; ``strict=True`` raises if the two complements ever differ in length.
    return [(hyp_idx, ref_idx) for ref_idx, hyp_idx in zip(kept_ref, kept_hyp, strict=True)]


def align_from_match_indices(
graph_metadata: GraphMetadata,
beam_size: int,
match_indices: list[tuple[int, int]],
) -> list[Alignment]:
"""Extract alignments from word-level match anchors, beam-searching the ambiguous spans between them."""
# NOTE: We always add an artificial terminal match node to simplify subspan extraction.
match_indices = match_indices + [(len(graph_metadata.hyp_norm), len(graph_metadata.ref_norm))]

# Iterate over the unambiguous matches to extract subspans (i.e., the span of words between two matches).
# Iterate over the matches to extract subspans (i.e., the span of words between two matches).
hyp_start, ref_start = (0, 0)
Comment thread
borgholt marked this conversation as resolved.
alignments = []
end_index = len(match_indices) - 1
Expand Down Expand Up @@ -174,7 +218,7 @@ def align_with_word_level_pass(
hyp_index=hyp_end,
)
)
ref_start, hyp_start = (ref_end + 1, hyp_end + 1)
hyp_start, ref_start = (hyp_end + 1, ref_end + 1)

return alignments

Expand Down
32 changes: 29 additions & 3 deletions tests/test_default.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import pytest
from error_align._cpp_beam_search import error_align_beam_search as cpp_error_align_beam_search
from typeguard import suppress_type_checks

Expand All @@ -6,7 +7,7 @@
from error_align.beam_search import _cpp_path_to_py_path
from error_align.beam_search import error_align_beam_search as python_error_align_beam_search
from error_align.edit_distance import compute_error_align_distance_matrix, compute_levenshtein_distance_matrix
from error_align.error_align import prepare_graph_metadata
from error_align.error_align import get_rapidfuzz_match_indices, prepare_graph_metadata
from error_align.graph_metadata import SubgraphMetadata
from error_align.utils import (
Alignment,
Expand All @@ -19,13 +20,14 @@
)


def test_error_align() -> None:
@pytest.mark.parametrize("word_level_method", ["rapidfuzz", "unambiguous"])
def test_error_align(word_level_method: str) -> None:
"""Test error alignment for an example including all substitution types."""

ref = "This is a substitution test deleted."
hyp = "Inserted this is a contribution test."

alignments = error_align(ref, hyp)
alignments = error_align(ref, hyp, word_level_method=word_level_method)
expected_ops = [
OpType.INSERT, # Inserted
OpType.MATCH, # This
Expand All @@ -40,6 +42,30 @@ def test_error_align() -> None:
assert alignment.op_type == op


@pytest.mark.parametrize(
    ("ref_norm", "hyp_norm", "expected"),
    [
        # Substitution: same length on both sides; only the token that differs is left out.
        (["this", "is", "a", "test"], ["this", "is", "a", "pest"], [(0, 0), (1, 1), (2, 2)]),
        # Insertion: the hypothesis has an extra "x"; "b" is still anchored at (hyp=2, ref=1).
        (["a", "b"], ["a", "x", "b"], [(0, 0), (2, 1)]),
        # Deletion: the reference has an extra "b"; "c" is still anchored at (hyp=1, ref=2).
        (["a", "b", "c"], ["a", "c"], [(0, 0), (1, 2)]),
    ],
)
def test_get_rapidfuzz_match_indices(
    ref_norm: list[str],
    hyp_norm: list[str],
    expected: list[tuple[int, int]],
) -> None:
    """Anchors are the tokens untouched by rapidfuzz edit ops, reported as (hyp_idx, ref_idx)."""

    match_indices = get_rapidfuzz_match_indices(ref_norm, hyp_norm)

    assert match_indices == expected


def test_error_align_unknown_word_level_method() -> None:
    """Passing an unrecognized ``word_level_method`` raises a ``ValueError`` naming the problem."""

    # The inputs differ, and the default word-level pass is left enabled.
    ref, hyp = "a b c", "a x c"

    with pytest.raises(ValueError, match="Unknown word_level_method"):
        error_align(ref, hyp, word_level_method="nonsense")


def test_beam_search_cpp_vs_python() -> None:
"""Test that the C++ and Python beam search implementations produce the same results."""

Expand Down
Loading