dscnn

`birdnet_stm32.models.dscnn` ¶

DS-CNN (depthwise-separable CNN) model architecture for audio classification.

The model consists of:

- An AudioFrontendLayer (from frontend.py) for feature extraction.
- A stem convolution to lift channels.
- Four stages of depthwise-separable or inverted-residual blocks with stride-2 downsampling.
- Optional squeeze-and-excite (SE) channel attention per block.
- Global average pooling (or attention pooling), dropout, and a dense classifier head.

Scaling is controlled via alpha (width multiplier) and depth_multiplier (block repeats). All channel counts are aligned to multiples of 8 for NPU vectorization.

`ds_conv_block(x, out_ch, stride_f=1, stride_t=1, name='ds', weight_decay=0.0001, drop_rate=0.1)` ¶

Depthwise-separable block (3x3 DW + 1x1 PW) with optional residual.

Parameters:

Name	Type	Description	Default
`x`	`Tensor`	Input tensor [B, H, W, C].	required
`out_ch`	`int`	Output channels for pointwise conv.	required
`stride_f`	`int`	Stride along frequency axis.	`1`
`stride_t`	`int`	Stride along time axis.	`1`
`name`	`str`	Base name for layers.	`'ds'`
`weight_decay`	`float`	L2 regularization for DW/PW kernels.	`0.0001`
`drop_rate`	`float`	Spatial dropout rate after PW BN.	`0.1`

Returns:

Type	Description
`Tensor`	Output tensor [B, H', W', out_ch].

Source code in birdnet_stm32/models/dscnn.py

def ds_conv_block(
    x: tf.Tensor,
    out_ch: int,
    stride_f: int = 1,
    stride_t: int = 1,
    name: str = "ds",
    weight_decay: float = 1e-4,
    drop_rate: float = 0.1,
) -> tf.Tensor:
    """Apply one depthwise-separable convolution block to ``x``.

    The block runs a 3x3 depthwise convolution (BN + ReLU capped at 6),
    then a 1x1 pointwise convolution (BN), optional spatial dropout, an
    optional identity shortcut, and a final ReLU capped at 6.

    The shortcut is only added when both strides are 1 and the static
    channel count of ``x`` equals ``out_ch``; otherwise the block is a
    plain feed-forward path.

    Args:
        x: Input tensor [B, H, W, C].
        out_ch: Number of filters of the pointwise convolution.
        stride_f: Depthwise stride along the first spatial (frequency) axis.
        stride_t: Depthwise stride along the second spatial (time) axis.
        name: Prefix used for every layer name created by this block.
        weight_decay: L2 factor for the depthwise and pointwise kernels;
            a falsy or non-positive value disables regularization.
        drop_rate: SpatialDropout2D rate applied after the pointwise BN;
            a falsy or non-positive value skips the dropout layer.

    Returns:
        Output tensor [B, H', W', out_ch].
    """
    # One regularizer instance is shared by the depthwise and pointwise kernels.
    kernel_reg = None
    if weight_decay and weight_decay > 0:
        kernel_reg = regularizers.l2(weight_decay)

    # Depthwise stage: 3x3 spatial filtering, optionally strided.
    depthwise = layers.DepthwiseConv2D(
        kernel_size=(3, 3),
        strides=(stride_f, stride_t),
        padding="same",
        use_bias=False,
        depthwise_regularizer=kernel_reg,
        name=f"{name}_dw",
    )
    out = depthwise(x)
    out = layers.BatchNormalization(name=f"{name}_dw_bn")(out)
    out = layers.ReLU(max_value=6, name=f"{name}_dw_relu")(out)

    # Pointwise stage: 1x1 channel mixing to ``out_ch`` filters.
    pointwise = layers.Conv2D(
        filters=out_ch,
        kernel_size=(1, 1),
        strides=(1, 1),
        padding="same",
        use_bias=False,
        kernel_regularizer=kernel_reg,
        name=f"{name}_pw",
    )
    out = pointwise(out)
    out = layers.BatchNormalization(name=f"{name}_pw_bn")(out)

    if drop_rate and drop_rate > 0:
        out = layers.SpatialDropout2D(drop_rate, name=f"{name}_drop")(out)

    # Identity shortcut is only valid when neither the spatial size nor the
    # channel count changes across the block.
    in_channels = x.shape[-1]
    unit_stride = stride_f == 1 and stride_t == 1
    if unit_stride and in_channels is not None and int(in_channels) == int(out_ch):
        out = layers.Add(name=f"{name}_add")([x, out])

    # The closing activation is applied after the (optional) shortcut add.
    return layers.ReLU(max_value=6, name=f"{name}_pw_relu")(out)

`build_dscnn_model(num_mels, spec_width, sample_rate, chunk_duration, embeddings_size, num_classes, audio_frontend='hybrid', alpha=1.0, depth_multiplier=1, fft_length=512, mag_scale='pwl', frontend_trainable=False, class_activation='softmax', dropout_rate=0.5, n_mfcc=20, weight_decay=0.0001, use_se=True, se_reduction=8, use_inverted_residual=True, expansion_factor=2, use_attention_pooling=False)` ¶

Build a DS-CNN model with a selectable audio frontend.

Parameters:

Name	Type	Description	Default
`num_mels`	`int`	Number of mel bins.	required
`spec_width`	`int`	Spectrogram width (frames).	required
`sample_rate`	`int`	Sampling rate (Hz).	required
`chunk_duration`	`int`	Chunk duration (seconds).	required
`embeddings_size`	`int`	Channels in the final embeddings layer.	required
`num_classes`	`int`	Number of output classes.	required
`audio_frontend`	`str`	'librosa' \| 'hybrid' \| 'raw' \| 'mfcc' \| 'log_mel' (deprecated: 'precomputed', 'tf').	`'hybrid'`
`alpha`	`float`	Width multiplier for the backbone.	`1.0`
`depth_multiplier`	`int`	Repeats multiplier for DS blocks per stage.	`1`
`fft_length`	`int`	FFT size for hybrid/librosa paths.	`512`
`mag_scale`	`str`	Magnitude scaling ('pcen' \| 'pwl' \| 'db' \| 'none').	`'pwl'`
`frontend_trainable`	`bool`	Make frontend sub-layers trainable.	`False`
`class_activation`	`str`	'softmax' or 'sigmoid' for the classifier head.	`'softmax'`
`dropout_rate`	`float`	Dropout rate before the classifier head.	`0.5`
`n_mfcc`	`int`	Number of MFCC coefficients (only used when audio_frontend='mfcc').	`20`
`weight_decay`	`float`	L2 regularization weight for DS-CNN blocks.	`0.0001`
`use_se`	`bool`	Add SE channel attention after each block.	`True`
`se_reduction`	`int`	SE channel reduction factor.	`8`
`use_inverted_residual`	`bool`	Use inverted residual blocks instead of DS blocks.	`True`
`expansion_factor`	`int`	Expansion factor for inverted residual hidden dim.	`2`
`use_attention_pooling`	`bool`	Use attention pooling instead of GAP.	`False`

Returns:

Type	Description
`Model`	Uncompiled DS-CNN Keras model.

Raises:

Type	Description
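`ValueError`	If the raw frontend input length (sample_rate * chunk_duration) is not below the STM32N6 limit of 65536 samples, or if `audio_frontend` is not a recognized frontend name.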

Source code in birdnet_stm32/models/dscnn.py

def build_dscnn_model(
    num_mels: int,
    spec_width: int,
    sample_rate: int,
    chunk_duration: int,
    embeddings_size: int,
    num_classes: int,
    audio_frontend: str = "hybrid",
    alpha: float = 1.0,
    depth_multiplier: int = 1,
    fft_length: int = 512,
    mag_scale: str = "pwl",
    frontend_trainable: bool = False,
    class_activation: str = "softmax",
    dropout_rate: float = 0.5,
    n_mfcc: int = 20,
    weight_decay: float = 1e-4,
    use_se: bool = True,
    se_reduction: int = 8,
    use_inverted_residual: bool = True,
    expansion_factor: int = 2,
    use_attention_pooling: bool = False,
) -> tf.keras.Model:
    """Assemble the DS-CNN classifier for the chosen audio frontend.

    Pipeline: AudioFrontendLayer -> 3x3 stem conv (stride 1x2) -> four
    stages of inverted-residual or depthwise-separable blocks (the first
    block of every stage uses stride 2x2, the remaining ones stride 1x1)
    -> optional 1x1 embeddings conv -> pooling -> dropout -> dense head.

    Args:
        num_mels: Number of mel bins.
        spec_width: Spectrogram width (frames).
        sample_rate: Sampling rate (Hz).
        chunk_duration: Chunk duration (seconds).
        embeddings_size: Requested channel count of the embeddings layer
            (passed through ``_make_divisible(..., 8)``).
        num_classes: Number of output classes.
        audio_frontend: 'librosa' | 'hybrid' | 'raw' | 'mfcc' | 'log_mel'
            (deprecated: 'precomputed', 'tf'). The name is passed through
            ``normalize_frontend_name`` before use.
        alpha: Width multiplier for the stem and stage channel counts.
        depth_multiplier: Multiplier for the per-stage block repeats
            (rounded up, at least one block per stage).
        fft_length: FFT size; for the hybrid frontend the model input has
            ``fft_length // 2 + 1`` bins.
        mag_scale: Magnitude scaling ('pcen' | 'pwl' | 'db' | 'none').
            Forced to 'none' for the 'mfcc' and 'log_mel' frontends.
        frontend_trainable: Make frontend sub-layers trainable.
        class_activation: 'softmax' or 'sigmoid' for the classifier head.
        dropout_rate: Dropout rate before the classifier head.
        n_mfcc: Number of MFCC coefficients (only used when audio_frontend='mfcc').
        weight_decay: L2 regularization weight for the backbone blocks.
        use_se: Add SE channel attention to each block.
        se_reduction: SE channel reduction factor.
        use_inverted_residual: Use inverted residual blocks instead of DS blocks.
        expansion_factor: Expansion factor for inverted residual hidden dim.
        use_attention_pooling: Use attention pooling instead of GAP.

    Returns:
        Uncompiled DS-CNN Keras model named "dscnn_audio".

    Raises:
        ValueError: If the raw frontend input length
            (sample_rate * chunk_duration) is 65536 or more, or if the
            normalized frontend name is not one of the supported modes.
    """
    audio_frontend = normalize_frontend_name(audio_frontend)

    # The raw-waveform input must stay below 2**16 samples on the STM32N6.
    if audio_frontend == "raw":
        raw_len = int(sample_rate * chunk_duration)
        if raw_len >= (1 << 16):
            raise ValueError(
                f"STM32N6 constraint: raw input length (sample_rate*chunk_duration={raw_len}) must be < 65536.\n"
                f"Use --sample_rate 16000, --chunk_duration 2, or --audio_frontend hybrid/librosa."
            )

    # Frontend arguments that are the same for every mode.
    shared_frontend_args = {
        "spec_width": spec_width,
        "sample_rate": sample_rate,
        "chunk_duration": chunk_duration,
        "fft_length": fft_length,
        "is_trainable": frontend_trainable,
        "name": "audio_frontend",
    }

    # Pick the model input and the matching frontend configuration.
    if audio_frontend in ("librosa", "mfcc", "log_mel"):
        # Features are computed outside the model; MFCC input uses n_mfcc bins.
        input_bins = n_mfcc if audio_frontend == "mfcc" else num_mels
        inputs = tf.keras.Input(shape=(input_bins, spec_width, 1), name="mel_spectrogram_input")
        frontend = AudioFrontendLayer(
            mode="precomputed",
            mel_bins=input_bins,
            mag_scale=mag_scale if audio_frontend == "librosa" else "none",
            **shared_frontend_args,
        )
    elif audio_frontend == "hybrid":
        # Linear-magnitude spectrogram input with fft_length // 2 + 1 bins.
        inputs = tf.keras.Input(shape=(fft_length // 2 + 1, spec_width, 1), name="linear_spectrogram_input")
        frontend = AudioFrontendLayer(
            mode="hybrid",
            mel_bins=num_mels,
            mag_scale=mag_scale,
            **shared_frontend_args,
        )
    elif audio_frontend == "raw":
        inputs = tf.keras.Input(shape=(int(chunk_duration * sample_rate), 1), name="raw_audio_input")
        frontend = AudioFrontendLayer(
            mode="raw",
            mel_bins=num_mels,
            mag_scale=mag_scale,
            **shared_frontend_args,
        )
    else:
        raise ValueError(f"Invalid audio_frontend: {audio_frontend}")
    x = frontend(inputs)

    # Stem (3x3, stride 1x2) to lift channels
    stem_ch = _make_divisible(int(16 * alpha), 8)
    x = layers.Conv2D(stem_ch, (3, 3), strides=(1, 2), padding="same", use_bias=False, name="stem_conv")(x)
    x = layers.BatchNormalization(name="stem_bn")(x)
    x = layers.ReLU(max_value=6, name="stem_relu")(x)

    # Per-stage base configuration: filters, block repeats, (stride_f, stride_t).
    base_filters = [32, 64, 128, 256]
    base_repeats = [2, 3, 4, 2]
    base_strides = [(2, 2), (2, 2), (2, 2), (2, 2)]
    stage_specs = zip(base_filters, base_repeats, base_strides, strict=True)

    for stage_idx, (filters, repeats, stage_stride) in enumerate(stage_specs, start=1):
        out_ch = _make_divisible(int(filters * alpha), 8)
        n_blocks = max(1, int(math.ceil(repeats * depth_multiplier)))

        for block_idx in range(1, n_blocks + 1):
            # Only the first block of a stage downsamples.
            block_sf, block_st = stage_stride if block_idx == 1 else (1, 1)

            if use_inverted_residual:
                x = inverted_residual_block(
                    x,
                    out_ch,
                    expansion=expansion_factor,
                    stride_f=block_sf,
                    stride_t=block_st,
                    use_se=use_se,
                    se_reduction=se_reduction,
                    weight_decay=weight_decay,
                    name=f"stage{stage_idx}_ir{block_idx}",
                )
                continue

            x = ds_conv_block(
                x,
                out_ch,
                stride_f=block_sf,
                stride_t=block_st,
                name=f"stage{stage_idx}_ds{block_idx}",
                weight_decay=weight_decay,
            )
            if use_se:
                x = se_block(x, reduction=se_reduction, name=f"stage{stage_idx}_se{block_idx}")

    # Project to the embeddings width unless the backbone already has it.
    emb_ch = _make_divisible(int(embeddings_size), 8)
    backbone_ch = x.shape[-1]
    if backbone_ch is None or int(backbone_ch) != int(emb_ch):
        x = layers.Conv2D(emb_ch, (1, 1), strides=(1, 1), padding="same", use_bias=False, name="emb_conv")(x)
        x = layers.BatchNormalization(name="emb_bn")(x)
        x = layers.ReLU(max_value=6, name="emb_relu")(x)

    # Head: pool over the spatial axes, regularize, classify.
    if use_attention_pooling:
        pooled = attention_pooling(x, name="attn_pool")
    else:
        pooled = layers.GlobalAveragePooling2D(name="gap")(x)
    pooled = layers.Dropout(dropout_rate, name="dropout")(pooled)
    outputs = layers.Dense(num_classes, activation=class_activation, name="pred")(pooled)
    return tf.keras.models.Model(inputs, outputs, name="dscnn_audio")

dscnn

birdnet_stm32.models.dscnn ¶

ds_conv_block(x, out_ch, stride_f=1, stride_t=1, name='ds', weight_decay=0.0001, drop_rate=0.1) ¶

`birdnet_stm32.models.dscnn` ¶

`ds_conv_block(x, out_ch, stride_f=1, stride_t=1, name='ds', weight_decay=0.0001, drop_rate=0.1)` ¶