diff --git a/firmware/fpga/board.py b/firmware/fpga/board.py index fe610a54..9185f872 100644 --- a/firmware/fpga/board.py +++ b/firmware/fpga/board.py @@ -6,7 +6,7 @@ # SPDX-License-Identifier: BSD-3-Clause from amaranth import Elaboratable, Signal, Instance, Module, ClockDomain -from amaranth.build import Resource, Pins, PinsN, Clock, Attrs +from amaranth.build import Resource, Pins, Clock, Attrs from amaranth.vendor import LatticeICE40Platform from amaranth_boards.resources import SPIResource @@ -37,16 +37,14 @@ class PralinePlatform(LatticeICE40Platform): Attrs(IO_STANDARD="SB_LVCMOS")), Resource("host_data", 0, Pins("21 19 6 13 10 3 4 18", dir="io"), Attrs(IO_STANDARD="SB_LVCMOS")), + Resource("q_invert", 0, Pins("9", dir="i"), + Attrs(IO_STANDARD="SB_LVCMOS")), Resource("direction", 0, Pins("12", dir="i"), Attrs(IO_STANDARD="SB_LVCMOS")), Resource("disable", 0, Pins("23", dir="i"), Attrs(IO_STANDARD="SB_LVCMOS")), Resource("capture_en", 0, Pins("11", dir="o"), Attrs(IO_STANDARD="SB_LVCMOS")), - - # Other I/O. - Resource("q_invert", 0, Pins("9", dir="i"), - Attrs(IO_STANDARD="SB_LVCMOS")), Resource("trigger_in", 0, Pins("48", dir="i"), Attrs(IO_STANDARD="SB_LVCMOS")), Resource("trigger_out", 0, Pins("2", dir="o"), diff --git a/firmware/fpga/build/praline_fpga.bin b/firmware/fpga/build/praline_fpga.bin index 6a1a29ad..8279693b 100644 Binary files a/firmware/fpga/build/praline_fpga.bin and b/firmware/fpga/build/praline_fpga.bin differ diff --git a/firmware/fpga/dsp/fir_mac16.py b/firmware/fpga/dsp/fir_mac16.py index 9bb18c79..fea4824a 100644 --- a/firmware/fpga/dsp/fir_mac16.py +++ b/firmware/fpga/dsp/fir_mac16.py @@ -7,7 +7,7 @@ from math import ceil, log2 from amaranth import Module, Signal, Mux, DomainRenamer, ClockSignal, signed -from amaranth.lib import wiring, stream, data, memory, fifo +from amaranth.lib import wiring, stream, data, memory from amaranth.lib.wiring import In, Out from amaranth.utils import bits_for @@ -58,7 +58,7 @@ class HalfBandDecimatorMAC16(wiring.Component): if not self.input.signature.always_ready: m.d.comb += self.input.ready.eq(~odd | fir.input.ready) - m.d.comb += dly.output.ready.eq(fir.input.ready) + m.d.comb += dly.output.ready.eq(1) m.d.comb += [ dly.input.p.eq(self.input.p), @@ -126,52 +126,30 @@ class HalfBandInterpolatorMAC16(wiring.Component): taps = [ 2 * tap for tap in self.taps ] arm0_taps = taps[0::2] - arm1_taps = taps[1::2] - delay = arm1_taps.index(1) # Arms - m.submodules.fir = fir = FIRFilterMAC16(arm0_taps, shape=self.data_shape, shape_out=self.shape_out, overclock_rate=self.overclock_rate, always_ready=always_ready, num_channels=self.num_channels) - m.submodules.dly = dly = Delay(delay, shape=self.data_shape, always_ready=always_ready, num_channels=self.num_channels) - m.submodules.dly_fifo = dly_fifo = fifo.SyncFIFOBuffered(width=self.num_channels*self.data_shape.as_shape().width, depth=self.overclock_rate+1) - - m.d.comb += [ - dly_fifo.w_data.eq(dly.output.p), - dly_fifo.w_en.eq(dly.output.valid), - ] - if not dly.output.signature.always_ready: - m.d.comb += dly.output.ready.eq(dly_fifo.w_rdy) + m.submodules.fir = fir = FIRFilterMAC16(arm0_taps, shape=self.data_shape, shape_out=self.shape_out, overclock_rate=self.overclock_rate, always_ready=always_ready, num_channels=self.num_channels, delayed_port=True) - winchin_valid = Signal() - winchin_ready_0 = Signal() - winchin_ready = Signal() - m.d.comb += [ - winchin_valid.eq(self.input.valid), - winchin_ready.eq(self.input.ready), - winchin_ready_0.eq(fir.input.ready), - ] - - #busy = Signal() - #with m.If(fir.input.valid & fir.input.ready): - # m.d.sync += busy.eq(1) + busy = Signal() + with m.If(fir.input.valid & fir.input.ready): + m.d.sync += busy.eq(1) # Input m.d.comb += fir.input.payload.eq(self.input.payload) - m.d.comb += fir.input.valid.eq(self.input.valid & dly.input.ready) - m.d.comb += dly.input.payload.eq(self.input.payload) - m.d.comb += dly.input.valid.eq(self.input.valid & fir.input.ready) + m.d.comb += fir.input.valid.eq(self.input.valid & ~busy) if not self.input.signature.always_ready: - m.d.comb += self.input.ready.eq(fir.input.ready & dly.input.ready) + m.d.comb += self.input.ready.eq(fir.input.ready & ~busy) # Output # Arm index selection: switch after every delivered sample arm_index = Signal() - #delayed = Signal.like(fir.input_delayed) - #with m.If(fir.output.valid & fir.output.ready): - # m.d.sync += delayed.eq(fir.input_delayed) - r_data_cast = data.ArrayLayout(self.data_shape, self.num_channels)(dly_fifo.r_data) + delayed = Signal.like(fir.input_delayed) + with m.If(fir.output.valid & fir.output.ready): + m.d.sync += delayed.eq(fir.input_delayed) + with m.If(~self.output.valid | self.output.ready): with m.Switch(arm_index): @@ -185,11 +163,10 @@ class HalfBandInterpolatorMAC16(wiring.Component): m.d.sync += arm_index.eq(1) with m.Case(1): for c in range(self.num_channels): - m.d.sync += self.output.payload[c].eq(r_data_cast[c]) - m.d.sync += self.output.valid.eq(dly_fifo.r_rdy) - m.d.comb += dly_fifo.r_en.eq(1) - with m.If(dly_fifo.r_rdy): - m.d.sync += arm_index.eq(0) + m.d.sync += self.output.payload[c].eq(delayed[c]) + m.d.sync += self.output.valid.eq(1) + m.d.sync += arm_index.eq(0) + m.d.sync += busy.eq(0) if self._domain != "sync": m = DomainRenamer(self._domain)(m) @@ -231,12 +208,11 @@ class FIRFilterMAC16(wiring.Component): }) super().__init__(signature) - def taps_shape(self, taps=None): - taps = taps or self.taps - taps_as_ratios = [tap.as_integer_ratio() for tap in taps] + def taps_shape(self): + taps_as_ratios = [tap.as_integer_ratio() for tap in self.taps] f_width = bits_for(max(tap[1] for tap in taps_as_ratios)) - 1 i_width = max(0, bits_for(max(abs(tap[0]) for tap in taps_as_ratios)) - f_width) - return fixed.Shape(i_width, f_width, signed=any(tap < 0 for tap in taps)) + return fixed.Shape(i_width, f_width, signed=any(tap < 0 for tap in self.taps)) def compute_output_shape(self): taps_shape = self.taps_shape() @@ -253,105 +229,101 @@ class FIRFilterMAC16(wiring.Component): def elaborate(self, platform): m = Module() - # Build filter out of SerialMAC16 blocks. + # Build filter out of FIRFilterSerialMAC16 blocks. overclock_factor = self.overclock_rate - taps = self.taps + # Symmetric coefficients special case. + symmetric = (self.taps == self.taps[::-1]) + + # Even-symmetric case. (N=2*K) + # Odd-symmetric case. (N=2*K+1) + if symmetric: + taps = self.taps[:ceil(len(self.taps)/2)] + odd_symmetric = ((len(self.taps) % 2) == 1) + else: + taps = self.taps + + dsp_block_count = ceil(len(taps) / overclock_factor) + + + def pipe(signal, length): + name = signal.name if hasattr(signal, "name") else "signal" + pipe = [ signal ] + [ Signal.like(signal, name=f"{name}_q{i}") for i in range(length) ] + for i in range(length): + m.d.sync += pipe[i+1].eq(pipe[i]) + return pipe + if self.carry is not None: sum_carry_q = Signal.like(self.sum_carry) - - filters_ready = Signal() - window_valid = Signal() - input_ready = Signal() - m.d.comb += input_ready.eq(~window_valid | filters_ready) - if not self.input.signature.always_ready: - m.d.comb += self.input.ready.eq(input_ready) - - # Samples window. - window = [ Signal.like(self.input.p, name=f"window_{i}") for i in range(len(self.taps)) ] - - with m.If(input_ready): - m.d.sync += window_valid.eq(self.input.valid) - with m.If(self.input.valid): - m.d.sync += window[0].eq(self.input.p) - for i in range(1, len(window)): - m.d.sync += window[i].eq(window[i-1]) - if self.carry is not None: - m.d.sync += sum_carry_q.eq(self.sum_carry) - - # When filter is symmetric, presum samples to obtain a smaller window. - symmetric = (self.taps == self.taps[::-1]) - if symmetric: - sum_shape = (self.input.p[0] + self.input.p[0]).shape() - odd_symmetric = ((len(self.taps) % 2) == 1) - new_len = len(self.taps) // 2 + odd_symmetric - new_window = [ Signal(data.ArrayLayout(sum_shape, self.num_channels), name=f"window_sym_{i}") for i in range(new_len) ] - for i in range(len(new_window) - odd_symmetric): - for c in range(self.num_channels): - m.d.comb += new_window[i][c].eq(window[i][c] + window[-i-1][c]) - if odd_symmetric: - for c in range(self.num_channels): - m.d.comb += new_window[-1][c].eq(window[len(self.taps)//2][c]) - window = new_window - taps = self.taps[:ceil(len(self.taps)/2)] - samples_shape = sum_shape - else: - samples_shape = self.shape - - # Build filter out of SerialMAC16 blocks: each one multiplies and - # accumulates `overclock_factor` taps serially. - dsp_block_count = ceil(len(taps) / overclock_factor) - - # If we have multiple subfilters, make them all the same size. - if dsp_block_count > 1 and len(taps) % overclock_factor != 0: - taps = taps + [0]*(overclock_factor - (len(taps)%overclock_factor)) + with m.If(self.input.valid & self.input.ready): + m.d.sync += sum_carry_q.eq(self.sum_carry) for c in range(self.num_channels): + last = self.input dsp_blocks = [] for i in range(dsp_block_count): taps_slice = taps[i*overclock_factor:(i+1)*overclock_factor] - window_slice = window[i*overclock_factor:(i+1)*overclock_factor] - carry = None if i > 0 else self.carry + input_delayed = len(taps_slice) + carry = last.output.p.shape() if i > 0 else self.carry - dsp = SerialMAC16(taps=taps_slice, shape=samples_shape, taps_shape=self.taps_shape(taps), carry=carry, always_ready=self.always_ready) + if (i == dsp_block_count-1) and symmetric and odd_symmetric: + taps_slice[-1] /= 2 + input_delayed -= 1 + + dsp = FIRFilterSerialMAC16(taps=taps_slice, shape=self.shape, taps_shape=self.taps_shape(), carry=carry, symmetry=symmetric, + input_delayed_cycles=input_delayed, always_ready=self.always_ready) dsp_blocks.append(dsp) - for j in range(len(window_slice)): - m.d.comb += dsp.input.p[j].eq(window_slice[j][c]) - m.d.comb += dsp.input.valid.eq(window_valid) - if i == 0: - m.d.comb += filters_ready.eq(dsp.input.ready) + m.d.comb += [ + dsp.input.p .eq(self.input.p[c]), + dsp.input.valid .eq(self.input.valid & self.input.ready), + ] + if not self.input.signature.always_ready: + m.d.comb += self.input.ready.eq(dsp.input.ready) if self.carry is not None: m.d.comb += dsp.sum_carry.eq(sum_carry_q[c]) + else: + m.d.comb += [ + dsp.input.p .eq(pipe(last.input_delayed, last.delay())[-1]), + dsp.input.valid .eq(last.output.valid), + dsp.sum_carry .eq(last.output.p), + ] + if not last.output.signature.always_ready: + m.d.comb += last.output.ready.eq(dsp.input.ready) + + last = dsp + + if self.delayed_port: + m.d.comb += self.input_delayed[c].eq(last.input_delayed) + + if symmetric: + + for i in reversed(range(dsp_block_count)): + end_block = (i == dsp_block_count-1) + m.d.comb += [ + dsp_blocks[i].rev_input .eq(dsp_blocks[i+1].rev_delayed if not end_block else dsp_blocks[i].input_delayed), + ] m.submodules += dsp_blocks - # Adder tree for channel c - if dsp_block_count > 1: - with m.If(~self.output.valid | self.output.ready): - for i in range(dsp_block_count): - if not dsp_blocks[i].output.signature.always_ready: - m.d.comb += dsp_blocks[i].output.ready.eq(1) - m.d.sync += self.output.valid.eq(dsp_blocks[0].output.valid) - with m.If(dsp_blocks[0].output.valid): - m.d.sync += self.output.payload[c] .eq(sum(dsp_blocks[i].output.p for i in range(dsp_block_count))) - else: - m.d.comb += self.output.payload[c].eq(dsp_blocks[0].output.p) - m.d.comb += self.output.valid.eq(dsp_blocks[0].output.valid) - if not dsp_blocks[0].output.signature.always_ready: - m.d.comb += dsp_blocks[0].output.ready.eq(self.output.ready) + m.d.comb += [ + self.output.payload[c] .eq(last.output.p), + self.output.valid .eq(last.output.valid), + ] + if not last.output.signature.always_ready: + m.d.comb += last.output.ready.eq(self.output.ready) return m -class SerialMAC16(wiring.Component): +class FIRFilterSerialMAC16(wiring.Component): - def __init__(self, taps, shape, shape_out=None, taps_shape=None, carry=None, always_ready=False): - assert shape.as_shape().width <= 16, f"DSP slice inputs have a maximum width of 16 bit. {shape} {shape.as_shape().width}" + def __init__(self, taps, shape, shape_out=None, taps_shape=None, carry=None, symmetry=False, input_delayed_cycles=None, always_ready=False): + assert shape.as_shape().width <= 16, "DSP slice inputs have a maximum width of 16 bit." self.carry = carry self.taps = list(taps) @@ -361,8 +333,15 @@ class SerialMAC16(wiring.Component): shape_out = self.compute_output_shape() self.shape_out = shape_out self.always_ready = always_ready + self.symmetry = symmetry + if input_delayed_cycles is None: + self.input_delayed_cycles = len(self.taps) + else: + self.input_delayed_cycles = input_delayed_cycles + signature = { - "input": In(stream.Signature(data.ArrayLayout(shape, len(taps)), always_ready=always_ready)), + "input": In(stream.Signature(shape, always_ready=always_ready)), + "input_delayed": Out(shape), "output": Out(stream.Signature(shape_out, always_ready=always_ready)), } if carry is not None: @@ -371,6 +350,11 @@ class SerialMAC16(wiring.Component): }) else: self.sum_carry = 0 + if symmetry: + signature.update({ + "rev_input": In(shape), + "rev_delayed": Out(shape), + }) super().__init__(signature) def taps_shape(self): @@ -391,36 +375,72 @@ class SerialMAC16(wiring.Component): shape_out = fixed.Shape(i_width, f_width, signed=signed) return shape_out + def delay(self): + return 1 + 1 + 3 + len(self.taps) - 1 + def elaborate(self, platform): m = Module() depth = len(self.taps) counter_in = Signal(range(depth)) - dsp_ready = Signal() + counter_mult = Signal(range(depth)) + counter_out = Signal(range(depth)) + dsp_ready = ~self.output.valid | self.output.ready + + window_valid = Signal() + window_ready = dsp_ready multin_valid = Signal() + input_ready = Signal() # Ready to process a sample either when the DSP slice is ready and the samples window is: # - Not valid yet. # - Only valid for 1 more cycle. - m.d.comb += input_ready.eq((counter_in == depth-1) & dsp_ready) + m.d.comb += input_ready.eq(~window_valid | ((counter_in == depth-1) & window_ready)) if not self.input.signature.always_ready: m.d.comb += self.input.ready.eq(input_ready) + window = [ Signal.like(self.input.p, name=f"window_{i}") for i in range(max(depth, self.input_delayed_cycles)) ] + + # Sample window. + with m.If(input_ready): + m.d.sync += window_valid.eq(self.input.valid) + with m.If(self.input.valid): + m.d.sync += window[0].eq(self.input.p) + for i in range(1, len(window)): + m.d.sync += window[i].eq(window[i-1]) + + m.d.sync += multin_valid.eq(window_valid) + + dsp_a = Signal.like(self.input.p) + with m.Switch(counter_in): + for i in range(depth): + with m.Case(i): + m.d.sync += dsp_a.eq(window[i]) + + m.d.comb += self.input_delayed.eq(window[self.input_delayed_cycles-1]) + # Sample counter. - with m.If((self.input.valid | (counter_in != 0)) & dsp_ready): + with m.If(window_ready & window_valid): m.d.sync += counter_in.eq(_incr(counter_in, depth)) - with m.If(dsp_ready): - m.d.sync += multin_valid.eq(self.input.valid | (counter_in != 0)) + # Symmetry handling. + if self.symmetry: - # Select sample from window. - dsp_a = Signal(self.shape) - with m.If(dsp_ready): + window_rev = [ Signal.like(self.input.p, name=f"window_rev_{i}") for i in range(depth) ] + + with m.If(input_ready & self.input.valid): + m.d.sync += window_rev[0].eq(self.rev_input) + m.d.sync += [ window_rev[i].eq(window_rev[i-1]) for i in range(1, len(window_rev)) ] + + m.d.comb += self.rev_delayed.eq(window_rev[-1]) + + dsp_a_rev = Signal.like(self.input.p) with m.Switch(counter_in): for i in range(depth): with m.Case(i): - m.d.sync += dsp_a.eq(self.input.p[i]) + m.d.sync += dsp_a_rev.eq(window_rev[depth-1-i]) + # Coefficient ROM. taps_shape = self.taps_shape @@ -433,38 +453,33 @@ class SerialMAC16(wiring.Component): m.submodules.coeff_rom = coeff_rom = memory.Memory(data=coeff_data) coeff_rd = coeff_rom.read_port(domain="sync") m.d.comb += coeff_rd.addr.eq(counter_in) - m.d.comb += coeff_rd.en.eq(dsp_ready) shape_out = self.compute_output_shape() if self.carry: sum_carry_q = Signal.like(self.sum_carry) - with m.If(input_ready): + with m.If(self.input.ready & self.input.valid): m.d.sync += sum_carry_q.eq(self.sum_carry) - m.submodules.dsp = dsp = iCE40Multiplier( - o_width=shape_out.as_shape().width, - always_ready=self.always_ready) - - valid_cnt = Signal(depth, init=1) - mult_cnt = Signal(depth, init=1) + m.submodules.dsp = dsp = iCE40Multiplier() + if self.symmetry: + m.d.comb += dsp.a.eq(dsp_a + dsp_a_rev) + else: + m.d.comb += dsp.a.eq(dsp_a) m.d.comb += [ - dsp.a .eq(dsp_a), dsp.b .eq(coeff_rd.data), shape_out(dsp.p) .eq(sum_carry_q if self.carry is not None else 0), - dsp.valid_in .eq(multin_valid), - dsp_ready .eq(dsp.ready_in), - dsp.p_load .eq(mult_cnt[0]), + dsp.valid_in .eq(multin_valid & window_ready), + dsp.p_load .eq(counter_mult == 0), self.output.p .eq(shape_out(dsp.o)), - self.output.valid .eq(dsp.valid_out & valid_cnt[-1]), - dsp.ready_out .eq(self.output.ready | ~valid_cnt[-1]), + self.output.valid .eq(dsp.valid_out & (counter_out == depth-1)), ] # Multiplier input and output counters. - with m.If(dsp.valid_in & dsp.ready_in): - m.d.sync += mult_cnt.eq(mult_cnt.rotate_left(1)) - with m.If(dsp.valid_out & dsp.ready_out): - m.d.sync += valid_cnt.eq(valid_cnt.rotate_left(1)) + with m.If(dsp.valid_in): + m.d.sync += counter_mult.eq(_incr(counter_mult, depth)) + with m.If(dsp.valid_out): + m.d.sync += counter_out.eq(_incr(counter_out, depth)) return m @@ -472,20 +487,15 @@ class SerialMAC16(wiring.Component): class iCE40Multiplier(wiring.Component): - def __init__(self, a_width=16, b_width=16, p_width=32, o_width=32, always_ready=False): - super().__init__({ - "a": In(signed(a_width)), - "b": In(signed(b_width)), - "valid_in": In(1), - "ready_in": In(1), - "p": In(signed(p_width)), - "p_load": In(1), - "o": Out(signed(o_width)), - "valid_out": Out(1), - "ready_out": In(1), - }) - self.always_ready = always_ready - self.o_width = o_width + a: In(signed(16)) + b: In(signed(16)) + valid_in: In(1) + + p: In(signed(32)) + p_load: In(1) + + o: Out(signed(32)) + valid_out: Out(1) def elaborate(self, platform): m = Module() @@ -497,20 +507,13 @@ class iCE40Multiplier(wiring.Component): return pipe p_load_v = Signal() - valid_v = Signal() - m.d.comb += valid_v.eq(self.valid_in & self.ready_in) dsp_delay = 3 - valid_pipe = pipe(valid_v, dsp_delay) - m.d.comb += p_load_v.eq(self.p_load & valid_v) + valid_pipe = pipe(self.valid_in, dsp_delay) + m.d.comb += p_load_v.eq(self.p_load & self.valid_in) p_pipe = pipe(self.p, dsp_delay-1) p_load_pipe = pipe(p_load_v, dsp_delay - 1) - - # skid buffer - if not self.always_ready: - m.submodules.out_fifo = out_fifo = fifo.SyncFIFOBuffered(width=self.o_width, depth=dsp_delay+2) - - m.d.comb += self.ready_in.eq(~self.valid_out | self.ready_out) + m.d.comb += self.valid_out.eq(valid_pipe[dsp_delay]) m.submodules.sb_mac16 = mac = SB_MAC16( C_REG=0, @@ -538,10 +541,10 @@ class iCE40Multiplier(wiring.Component): # Inputs. mac.CLK .eq(ClockSignal("sync")), mac.CE .eq(1), - mac.C.as_signed().eq(Mux(p_load_pipe[2], p_pipe[2][16:], mac.O[16:])), - mac.A.as_signed().eq(self.a), - mac.B.as_signed().eq(self.b), - mac.D.as_signed().eq(Mux(p_load_pipe[2], p_pipe[2][:16], mac.O[:16])), + mac.C .eq(Mux(p_load_pipe[2], p_pipe[2][16:], self.o[16:])), + mac.A .eq(self.a), + mac.B .eq(self.b), + mac.D .eq(Mux(p_load_pipe[2], p_pipe[2][:16], self.o[:16])), mac.AHOLD .eq(~valid_pipe[0]), # 0: load mac.BHOLD .eq(~valid_pipe[0]), mac.CHOLD .eq(0), @@ -552,23 +555,11 @@ class iCE40Multiplier(wiring.Component): mac.ADDSUBBOT .eq(0), mac.OLOADTOP .eq(0), mac.OLOADBOT .eq(0), + + # Outputs. + self.o .eq(mac.O), ] - if not self.always_ready: - m.d.comb += [ - out_fifo.w_data.eq(mac.O), - out_fifo.w_en.eq(valid_pipe[dsp_delay]), - - self.o.eq(out_fifo.r_data), - self.valid_out.eq(out_fifo.r_rdy), - out_fifo.r_en.eq(self.ready_out), - ] - else: - m.d.comb += [ - self.o.eq(mac.O), - self.valid_out.eq(valid_pipe[dsp_delay]), - ] - return m @@ -602,7 +593,7 @@ class _TestFilter(unittest.TestCase): return samples / (1 << f_width) return samples - def _filter(self, dut, samples, count, num_channels=1, outfile=None, empty_cycles=0, empty_ready_cycles=0): + def _filter(self, dut, samples, count, num_channels=1, outfile=None, empty_cycles=0): async def input_process(ctx): if hasattr(dut, "enable"): @@ -636,10 +627,6 @@ class _TestFilter(unittest.TestCase): filtered.append(payload[0].as_float()) else: filtered.append(payload.as_float()) - if empty_ready_cycles > 0: - ctx.set(dut.output.ready, 0) - await ctx.tick().repeat(empty_ready_cycles) - ctx.set(dut.output.ready, 1) if not dut.output.signature.always_ready: ctx.set(dut.output.ready, 0) @@ -658,6 +645,23 @@ class _TestFilter(unittest.TestCase): class TestFIRFilterMAC16(_TestFilter): + def test_filter_serial(self): + taps = [-1, 0, 9, 16, 9, 0, -1] + taps = [ tap / 32 for tap in taps ] + + num_samples = 1024 + input_width = 8 + input_samples = self._generate_samples(num_samples, input_width) + + # Compute the expected result + filtered_np = np.convolve(input_samples, taps).tolist() + + # Simulate DUT + dut = FIRFilterSerialMAC16(taps, fixed.SQ(15, 0), always_ready=False) + filtered = self._filter(dut, input_samples, len(input_samples)) + + self.assertListEqual(filtered_np[:len(filtered)], filtered) + def test_filter(self): taps = [-1, 0, 9, 16, 9, 0, -1] taps = [ tap / 32 for tap in taps ] @@ -670,8 +674,8 @@ class TestFIRFilterMAC16(_TestFilter): filtered_np = np.convolve(input_samples, taps).tolist() # Simulate DUT - dut = FIRFilterMAC16(taps, shape=fixed.SQ(8, 0), always_ready=False) - filtered = self._filter(dut, input_samples, len(input_samples), empty_ready_cycles=5) + dut = FIRFilterMAC16(taps, fixed.SQ(15, 0), always_ready=False) + filtered = self._filter(dut, input_samples, len(input_samples)) self.assertListEqual(filtered_np[:len(filtered)], filtered) @@ -713,7 +717,7 @@ class TestHalfBandDecimatorMAC16(_TestFilter): "test_filter_no_backpressure_and_empty_cycles_taps1": { "num_samples": 1024, "dut_options": dict(**common_dut_options, always_ready=True, taps=taps0), - "sim_opts": dict(empty_cycles=6), + "sim_opts": dict(empty_cycles=3), }, "test_filter_no_backpressure": { @@ -764,20 +768,20 @@ class TestHalfBandInterpolatorMAC16(_TestFilter): "test_filter_with_backpressure": { "num_samples": 1024, - "dut_options": dict(**common_dut_options, always_ready=False, num_channels=2, taps=taps1), - "sim_opts": dict(empty_cycles=0, empty_ready_cycles=0), + "dut_options": dict(**common_dut_options, always_ready=False, num_channels=2, taps=taps0), + "sim_opts": dict(empty_cycles=0), }, "test_filter_with_backpressure_and_empty_cycles": { "num_samples": 1024, "dut_options": dict(**common_dut_options, num_channels=2, always_ready=False, taps=taps0), - "sim_opts": dict(empty_ready_cycles=7, empty_cycles=3), + "sim_opts": dict(empty_cycles=3), }, "test_filter_with_backpressure_taps1": { "num_samples": 1024, "dut_options": dict(**common_dut_options, num_channels=2, always_ready=False, taps=taps1), - "sim_opts": dict(empty_ready_cycles=7, empty_cycles=0), + "sim_opts": dict(empty_cycles=0), }, "test_filter_no_backpressure_and_empty_cycles_taps1": { diff --git a/firmware/fpga/interface/__init__.py b/firmware/fpga/interface/__init__.py index 530d7af8..a19e3fc2 100644 --- a/firmware/fpga/interface/__init__.py +++ b/firmware/fpga/interface/__init__.py @@ -1,3 +1 @@ -from .max586x import MAX586xInterface -from .spi import SPIRegisterInterface -from .sgpio import SGPIOInterface +from .max586x import MAX586xInterface \ No newline at end of file diff --git a/firmware/fpga/interface/max586x.py b/firmware/fpga/interface/max586x.py index 60ffade9..b94d2152 100644 --- a/firmware/fpga/interface/max586x.py +++ b/firmware/fpga/interface/max586x.py @@ -9,11 +9,13 @@ from amaranth.lib.wiring import Out, In from util import IQSample - class MAX586xInterface(wiring.Component): - adc_stream: Out(stream.Signature(IQSample(8), always_ready=True, always_valid=True)) + adc_stream: Out(stream.Signature(IQSample(8), always_ready=True)) dac_stream: In(stream.Signature(IQSample(8), always_ready=True)) - q_invert: In(1) + + adc_capture: In(1) + dac_capture: In(1) + q_invert: In(1) def __init__(self, bb_domain): super().__init__() @@ -45,9 +47,10 @@ class MAX586xInterface(wiring.Component): m.d.comb += [ adc_stream.p.i .eq(adc_in.i[0] ^ 0x80), # I: non-inverted between MAX2837 and MAX5864. adc_stream.p.q .eq(adc_in.i[1] ^ rx_q_mask), # Q: inverted between MAX2837 and MAX5864. + adc_stream.valid .eq(self.adc_capture), ] - # Output to the DAC using a DDR output buffer. + # Output the transformed data to the DAC using a DDR output buffer. m.submodules.dac_out = dac_out = io.DDRBuffer("o", platform.request("dd", dir="-"), o_domain=self._bb_domain) with m.If(dac_stream.valid): m.d.comb += [ diff --git a/firmware/fpga/interface/sgpio.py b/firmware/fpga/interface/sgpio.py deleted file mode 100644 index cca0b116..00000000 --- a/firmware/fpga/interface/sgpio.py +++ /dev/null @@ -1,204 +0,0 @@ -# -# This file is part of HackRF. -# -# Copyright (c) 2025 Great Scott Gadgets -# SPDX-License-Identifier: BSD-3-Clause - -from amaranth import Module, Signal, DomainRenamer, EnableInserter, ClockSignal, Instance -from amaranth.lib import io, fifo, stream, wiring, cdc -from amaranth.lib.wiring import Out, In - -from util import LinearFeedbackShiftRegister - - -class SGPIOInterface(wiring.Component): - - def __init__(self, sample_width=8, rx_assignments=None, tx_assignments=None, domain="sync"): - self.sample_width = sample_width - if rx_assignments is None: - rx_assignments = _default_rx_assignments(sample_width // 8) - if tx_assignments is None: - tx_assignments = _default_tx_assignments(sample_width // 8) - self.rx_assignments = rx_assignments - self.tx_assignments = tx_assignments - self._domain = domain - super().__init__({ - "adc_stream": In(stream.Signature(sample_width, always_ready=True)), - "dac_stream": Out(stream.Signature(sample_width)), - "trigger_en": In(1), - "prbs": In(1), - }) - - def elaborate(self, platform): - m = Module() - - adc_stream = self.adc_stream - dac_stream = self.dac_stream - rx_cycles = len(self.rx_assignments) - tx_cycles = len(self.tx_assignments) - - direction_i = platform.request("direction").i - enable_i = ~platform.request("disable").i - capture_en = platform.request("capture_en").o - - # Determine data transfer direction. - direction = Signal() - m.submodules.direction_cdc = cdc.FFSynchronizer(direction_i, direction, o_domain=self._domain) - transfer_from_adc = (direction == 0) - - # SGPIO clock and data lines. - tx_clk_en = Signal() - rx_clk_en = Signal() - data_to_host = Signal(self.sample_width) - byte_to_host = Signal(8) - data_from_host = Signal(self.sample_width) - byte_from_host = Signal(8) - - m.submodules.clk_out = clk_out = io.DDRBuffer("o", platform.request("host_clk", dir="-"), o_domain=self._domain) - m.submodules.host_io = host_io = io.DDRBuffer('io', platform.request("host_data", dir="-"), i_domain=self._domain, o_domain=self._domain) - - m.d.sync += clk_out.o[0].eq(tx_clk_en) - m.d.sync += clk_out.o[1].eq(rx_clk_en) - m.d.sync += host_io.oe.eq(transfer_from_adc) - m.d.comb += host_io.o[0].eq(byte_to_host) - m.d.comb += host_io.o[1].eq(byte_to_host) - m.d.comb += byte_from_host.eq(host_io.i[1]) - - # Transmission is handled differently to account for the latency before the data - # becomes available in the FPGA fabric. - ddr_in_latency = 2 # for iCE40 DDR inputs in Amaranth. - tx_write_latency = tx_cycles + ddr_in_latency - tx_write_pipe = Signal(tx_write_latency) - m.d.sync += tx_write_pipe.eq(tx_write_pipe << 1) - for i in range(tx_cycles-1): # don't store last byte - with m.If(tx_write_pipe[ddr_in_latency + i]): - m.d.sync += self.tx_assignments[i](data_from_host, byte_from_host) - - # Small TX FIFO to avoid missing samples when the consumer deasserts its ready - # signal and transfers are in progress. - m.submodules.tx_fifo = tx_fifo = fifo.SyncFIFOBuffered(width=self.sample_width, depth=16) - m.d.comb += [ - tx_fifo.w_data .eq(data_from_host), - self.tx_assignments[-1](tx_fifo.w_data, byte_from_host), - tx_fifo.w_en .eq(tx_write_pipe[-1]), - dac_stream.p .eq(tx_fifo.r_data), - dac_stream.valid .eq(tx_fifo.r_rdy), - tx_fifo.r_en .eq(dac_stream.ready), - ] - - # Pseudo-random binary sequence generator. - prbs_advance = Signal() - prbs_count = Signal(2) - m.submodules.prbs = prbs = EnableInserter(prbs_advance)( - LinearFeedbackShiftRegister(degree=8, taps=[8,6,5,4], init=0b10110001)) - - - # Capture signal generation. - capture = Signal() - m.submodules.trigger_gen = trigger_gen = FlowAndTriggerControl(domain=self._domain) - m.d.comb += [ - trigger_gen.enable.eq(enable_i), - trigger_gen.trigger_en.eq(self.trigger_en), - capture.eq(trigger_gen.capture), - ] - - - # Main state machine. - with m.FSM(): - with m.State("IDLE"): - - m.d.sync += capture_en.eq(capture) - - with m.If(transfer_from_adc): - with m.If(self.prbs): - m.d.sync += capture_en.eq(1) - m.next = "PRBS" - with m.Elif(adc_stream.valid & capture): - m.d.comb += rx_clk_en.eq(1) - m.d.sync += data_to_host.eq(adc_stream.p) - m.d.sync += byte_to_host.eq(self.rx_assignments[0](adc_stream.p)) - if rx_cycles > 1: - m.next = "RX0" - with m.Else(): - with m.If(dac_stream.ready & capture): - m.d.comb += tx_clk_en.eq(1) - m.d.sync += tx_write_pipe[0].eq(capture) - if tx_cycles > 1: - m.next = "TX0" - - for i in range(rx_cycles-1): - with m.State(f"RX{i}"): - m.d.comb += rx_clk_en.eq(1) - m.d.sync += byte_to_host.eq(self.rx_assignments[i+1](data_to_host)) - m.next = "IDLE" if i == rx_cycles-2 else f"RX{i+1}" - - for i in range(tx_cycles-1): - with m.State(f"TX{i}"): - m.d.comb += tx_clk_en.eq(1) - m.next = "IDLE" if i == tx_cycles-2 else f"TX{i+1}" - - with m.State("PRBS"): - m.d.comb += rx_clk_en.eq(prbs_count == 0) - m.d.comb += prbs_advance.eq(prbs_count == 0) - m.d.sync += byte_to_host.eq(prbs.value) - m.d.sync += prbs_count.eq(prbs_count + 1) - with m.If(~self.prbs): - m.next = "IDLE" - - # Convert to other clock domain if necessary. - if self._domain != "sync": - m = DomainRenamer(self._domain)(m) - - return m - - -def _default_rx_assignments(n): - def rx_assignment(i): - def _f(w): - return w.word_select(i, 8) - return _f - return [ rx_assignment(i) for i in range(n) ] - -def _default_tx_assignments(n): - def tx_assignment(i): - def _f(w, v): - return w.word_select(i, 8).eq(v) - return _f - return [ tx_assignment(i) for i in range(n) ] - - -class FlowAndTriggerControl(wiring.Component): - trigger_en: In(1) - enable: In(1) - capture: Out(1) - - def __init__(self, domain): - super().__init__() - self._domain = domain - - def elaborate(self, platform): - m = Module() - - # - # Signal synchronization and trigger logic. - # - trigger_enable = self.trigger_en - trigger_in = platform.request("trigger_in").i - trigger_out = platform.request("trigger_out").o - m.d.comb += trigger_out.eq(self.enable) - - # Create a latch for the trigger input signal using a special FPGA primitive. - trigger_in_latched = Signal() - trigger_in_reg = Instance("SB_DFFES", - i_D = 0, - i_S = trigger_in, # async set - i_E = ~self.enable, - i_C = ClockSignal(self._domain), - o_Q = trigger_in_latched - ) - m.submodules.trigger_in_reg = trigger_in_reg - - # Export signal for capture gating. - m.d[self._domain] += self.capture.eq(self.enable & (trigger_in_latched | ~trigger_enable)) - - return m diff --git a/firmware/fpga/requirements.txt b/firmware/fpga/requirements.txt index 87b248b0..4b676b22 100644 --- a/firmware/fpga/requirements.txt +++ b/firmware/fpga/requirements.txt @@ -1,4 +1,3 @@ amaranth==v0.5.8 amaranth-boards @ git+https://github.com/amaranth-lang/amaranth-boards.git@23c66d6 lz4 -numpy diff --git a/firmware/fpga/top/ext_precision_rx.py b/firmware/fpga/top/ext_precision_rx.py index 3950458e..6eb3f138 100644 --- a/firmware/fpga/top/ext_precision_rx.py +++ b/firmware/fpga/top/ext_precision_rx.py @@ -4,13 +4,15 @@ # Copyright (c) 2025 Great Scott Gadgets # SPDX-License-Identifier: BSD-3-Clause -from amaranth import Elaboratable, Module, Cat, DomainRenamer -from amaranth.lib.wiring import connect +from amaranth import Elaboratable, Module, Signal, Mux, Instance, Cat, ClockSignal, DomainRenamer +from amaranth.lib import io, fifo, stream, wiring +from amaranth.lib.wiring import Out, In, connect from amaranth_future import fixed from board import PralinePlatform, ClockDomainGenerator -from interface import MAX586xInterface, SGPIOInterface, SPIRegisterInterface +from interface import MAX586xInterface +from interface.spi import SPIRegisterInterface from dsp.fir import FIRFilter from dsp.fir_mac16 import HalfBandDecimatorMAC16 from dsp.cic import CICDecimator @@ -19,6 +21,119 @@ from dsp.quarter_shift import QuarterShift from util import ClockConverter, IQSample +class MCUInterface(wiring.Component): + adc_stream: In(stream.Signature(IQSample(12), always_ready=True)) + direction: In(1) + enable: In(1) + + def __init__(self, domain="sync"): + self._domain = domain + super().__init__() + + def elaborate(self, platform): + m = Module() + + adc_stream = self.adc_stream + + # Determine data transfer direction. + direction = Signal() + enable = Signal() + m.d.sync += enable.eq(self.enable) + m.d.sync += direction.eq(self.direction) + transfer_from_adc = (direction == 0) + + # SGPIO clock and data lines. + m.submodules.clk_out = clk_out = io.DDRBuffer("o", platform.request("host_clk", dir="-"), o_domain=self._domain) + m.submodules.host_io = host_io = io.DDRBuffer('io', platform.request("host_data", dir="-"), i_domain=self._domain, o_domain=self._domain) + + # State machine to control SGPIO clock and data lines. + rx_clk_en = Signal() + m.d.sync += clk_out.o[1].eq(rx_clk_en) + m.d.sync += host_io.oe.eq(transfer_from_adc) + + data_to_host = Signal.like(adc_stream.p) + rx_data_buffer = Signal(8) + m.d.comb += host_io.o[0].eq(rx_data_buffer) + m.d.comb += host_io.o[1].eq(rx_data_buffer) + + with m.FSM(): + with m.State("IDLE"): + m.d.comb += rx_clk_en.eq(enable & transfer_from_adc & adc_stream.valid) + + with m.If(rx_clk_en): + m.d.sync += rx_data_buffer.eq(adc_stream.p.i >> 8) + m.d.sync += data_to_host.eq(adc_stream.p) + m.next = "RX_I1" + + with m.State("RX_I1"): + m.d.comb += rx_clk_en.eq(1) + m.d.sync += rx_data_buffer.eq(data_to_host.i) + m.next = "RX_Q0" + + with m.State("RX_Q0"): + m.d.comb += rx_clk_en.eq(1) + m.d.sync += rx_data_buffer.eq(data_to_host.q >> 8) + m.next = "RX_Q1" + + with m.State("RX_Q1"): + m.d.comb += rx_clk_en.eq(1) + m.d.sync += rx_data_buffer.eq(data_to_host.q) + m.next = "IDLE" + + if self._domain != "sync": + m = DomainRenamer(self._domain)(m) + + return m + + +class FlowAndTriggerControl(wiring.Component): + trigger_en: In(1) + direction: Out(1) # async + enable: Out(1) # async + adc_capture: Out(1) + dac_capture: Out(1) + + def __init__(self, domain): + super().__init__() + self._domain = domain + + def elaborate(self, platform): + m = Module() + + # + # Signal synchronization and trigger logic. + # + trigger_enable = self.trigger_en + trigger_in = platform.request("trigger_in").i + trigger_out = platform.request("trigger_out").o + host_data_enable = ~platform.request("disable").i + m.d.comb += trigger_out.eq(host_data_enable) + + # Create a latch for the trigger input signal using a special FPGA primitive. + trigger_in_latched = Signal() + trigger_in_reg = Instance("SB_DFFES", + i_D = 0, + i_S = trigger_in, # async set + i_E = ~host_data_enable, + i_C = ClockSignal(self._domain), + o_Q = trigger_in_latched + ) + m.submodules.trigger_in_reg = trigger_in_reg + + # Export signals for direction control and capture gating. + m.d.comb += self.direction.eq(platform.request("direction").i) + m.d.comb += self.enable.eq(host_data_enable) + + with m.If(host_data_enable): + m.d[self._domain] += self.adc_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 0)) + m.d[self._domain] += self.dac_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 1)) + with m.Else(): + m.d[self._domain] += self.adc_capture.eq(0) + m.d[self._domain] += self.dac_capture.eq(0) + + return m + + class Top(Elaboratable): def elaborate(self, platform): @@ -27,25 +142,15 @@ class Top(Elaboratable): m.submodules.clkgen = ClockDomainGenerator() # Submodules. + m.submodules.flow_ctl = flow_ctl = FlowAndTriggerControl(domain="gck1") m.submodules.adcdac_intf = adcdac_intf = MAX586xInterface(bb_domain="gck1") - m.submodules.mcu_intf = mcu_intf = SGPIOInterface( - sample_width=24, - rx_assignments=[ - lambda w: Cat(w[8:12], w[11].replicate(4)), - lambda w: w[0:8], - lambda w: Cat(w[20:24], w[23].replicate(4)), - lambda w: w[12:20], - ], - tx_assignments=[ - lambda w, v: w[8:12].eq(v), - lambda w, v: w[0:8].eq(v), - lambda w, v: w[20:24].eq(v), - lambda w, v: w[12:20].eq(v), - ], - domain="sync" - ) + m.submodules.mcu_intf = mcu_intf = MCUInterface(domain="sync") + m.d.comb += adcdac_intf.adc_capture.eq(flow_ctl.adc_capture) + m.d.comb += adcdac_intf.dac_capture.eq(flow_ctl.dac_capture) m.d.comb += adcdac_intf.q_invert.eq(platform.request("q_invert").i) + m.d.comb += mcu_intf.direction.eq(flow_ctl.direction) + m.d.comb += mcu_intf.enable.eq(flow_ctl.enable) # Half-band filter taps. taps_hb1 = [-2, 0, 5, 0, -10, 0,18, 0, -30, 0,53, 0,-101, 0, 323, 512, 323, 0,-101, 0, 53, 0, -30, 0,18, 0, -10, 0, 5, 0,-2] @@ -68,7 +173,7 @@ class Top(Elaboratable): "hbfir2": HalfBandDecimatorMAC16(taps_hb2, data_shape=fixed.SQ(11), overclock_rate=8, always_ready=True, domain="gck1"), # Clock domain conversion. - "clkconv": ClockConverter(IQSample(12), 8, "gck1", "sync", always_ready=True), + "clkconv": ClockConverter(IQSample(12), 4, "gck1", "sync", always_ready=True), } for k,v in rx_chain.items(): m.submodules[f"rx_{k}"] = v @@ -91,7 +196,7 @@ class Top(Elaboratable): m.d.comb += [ # Trigger enable. - mcu_intf.trigger_en .eq(ctrl[7]), + flow_ctl.trigger_en .eq(ctrl[7]), # RX settings. rx_chain["dc_block"].enable .eq(ctrl[0]), diff --git a/firmware/fpga/top/ext_precision_tx.py b/firmware/fpga/top/ext_precision_tx.py index 6b55acc4..4268606d 100644 --- a/firmware/fpga/top/ext_precision_tx.py +++ b/firmware/fpga/top/ext_precision_tx.py @@ -4,19 +4,140 @@ # Copyright (c) 2025 Great Scott Gadgets # SPDX-License-Identifier: BSD-3-Clause -from amaranth import Elaboratable, Module, Cat, DomainRenamer -from amaranth.lib.wiring import connect +from amaranth import Elaboratable, Module, Signal, Instance, Cat, ClockSignal, DomainRenamer +from amaranth.lib import io, fifo, stream, wiring +from amaranth.lib.wiring import Out, In, connect from amaranth_future import fixed from board import PralinePlatform, ClockDomainGenerator -from interface import MAX586xInterface, SGPIOInterface, SPIRegisterInterface +from interface import MAX586xInterface +from interface.spi import SPIRegisterInterface from dsp.fir import FIRFilter from dsp.fir_mac16 import HalfBandInterpolatorMAC16 from dsp.cic import CICInterpolator from util import ClockConverter, IQSample, StreamSkidBuffer +class MCUInterface(wiring.Component): + dac_stream: Out(stream.Signature(IQSample(12))) + direction: In(1) + enable: In(1) + + def __init__(self, domain="sync"): + self._domain = domain + super().__init__() + + def elaborate(self, platform): + m = Module() + + dac_stream = self.dac_stream + + # Determine data transfer direction. + direction = Signal() + enable = Signal() + m.d.sync += enable.eq(self.enable) + m.d.sync += direction.eq(self.direction) + transfer_to_dac = (direction == 1) + + # SGPIO clock and data lines. + m.submodules.clk_out = clk_out = io.DDRBuffer("o", platform.request("host_clk", dir="-"), o_domain=self._domain) + m.submodules.host_io = host_io = io.DDRBuffer('io', platform.request("host_data", dir="-"), i_domain=self._domain, o_domain=self._domain) + + # State machine to control SGPIO clock and data lines. + tx_clk_en = Signal() + m.d.sync += clk_out.o[0].eq(tx_clk_en) + + tx_dly_write = Signal(4) + tx_in_sample = Signal(4*8) + m.d.sync += tx_dly_write.eq(tx_dly_write << 1) + m.d.sync += tx_in_sample.eq(Cat(host_io.i[1], tx_in_sample)) + + # Small TX FIFO to avoid overflows from the write delay. + m.submodules.tx_fifo = tx_fifo = fifo.SyncFIFOBuffered(width=24, depth=4) + m.d.comb += [ + tx_fifo.w_data.word_select(0, 12) .eq(tx_in_sample[20:32]), + tx_fifo.w_data.word_select(1, 12) .eq(tx_in_sample[4:16]), + tx_fifo.w_en .eq(tx_dly_write[-1]), + dac_stream.p .eq(tx_fifo.r_data), + dac_stream.valid .eq(tx_fifo.r_rdy), + tx_fifo.r_en .eq(dac_stream.ready), + ] + + with m.FSM(): + with m.State("IDLE"): + m.d.comb += tx_clk_en.eq(enable & transfer_to_dac & dac_stream.ready) + + with m.If(tx_clk_en): + m.next = "TX_I1" + + with m.State("TX_I1"): + m.d.comb += tx_clk_en.eq(1) + m.next = "TX_Q0" + + with m.State("TX_Q0"): + m.d.comb += tx_clk_en.eq(1) + m.next = "TX_Q1" + + with m.State("TX_Q1"): + m.d.comb += tx_clk_en.eq(1) + m.d.sync += tx_dly_write[0].eq(1) # delayed write + m.next = "IDLE" + + if self._domain != "sync": + m = DomainRenamer(self._domain)(m) + + return m + + +class FlowAndTriggerControl(wiring.Component): + trigger_en: In(1) + direction: Out(1) # async + enable: Out(1) # async + adc_capture: Out(1) + dac_capture: Out(1) + + def __init__(self, domain): + super().__init__() + self._domain = domain + + def elaborate(self, platform): + m = Module() + + # + # Signal synchronization and trigger logic. + # + trigger_enable = self.trigger_en + trigger_in = platform.request("trigger_in").i + trigger_out = platform.request("trigger_out").o + host_data_enable = ~platform.request("disable").i + m.d.comb += trigger_out.eq(host_data_enable) + + # Create a latch for the trigger input signal using a special FPGA primitive. + trigger_in_latched = Signal() + trigger_in_reg = Instance("SB_DFFES", + i_D = 0, + i_S = trigger_in, # async set + i_E = ~host_data_enable, + i_C = ClockSignal(self._domain), + o_Q = trigger_in_latched + ) + m.submodules.trigger_in_reg = trigger_in_reg + + # Export signals for direction control and capture gating. + m.d.comb += self.direction.eq(platform.request("direction").i) + m.d.comb += self.enable.eq(host_data_enable) + + with m.If(host_data_enable): + m.d[self._domain] += self.adc_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 0)) + m.d[self._domain] += self.dac_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 1)) + with m.Else(): + m.d[self._domain] += self.adc_capture.eq(0) + m.d[self._domain] += self.dac_capture.eq(0) + + return m + + class Top(Elaboratable): def elaborate(self, platform): @@ -25,27 +146,15 @@ class Top(Elaboratable): m.submodules.clkgen = ClockDomainGenerator() # Submodules. + m.submodules.flow_ctl = flow_ctl = FlowAndTriggerControl(domain="gck1") m.submodules.adcdac_intf = adcdac_intf = MAX586xInterface(bb_domain="gck1") - m.submodules.mcu_intf = mcu_intf = SGPIOInterface( - sample_width=24, - rx_assignments=[ - lambda w: Cat(w[8:12], w[11].replicate(4)), - lambda w: w[0:8], - lambda w: Cat(w[20:24], w[23].replicate(4)), - lambda w: w[12:20], - ], - tx_assignments=[ - lambda w, v: w[8:12].eq(v), - lambda w, v: w[0:8].eq(v), - lambda w, v: w[20:24].eq(v), - lambda w, v: w[12:20].eq(v), - ], - domain="sync" - ) + m.submodules.mcu_intf = mcu_intf = MCUInterface(domain="sync") + m.d.comb += adcdac_intf.dac_capture.eq(flow_ctl.dac_capture) m.d.comb += adcdac_intf.q_invert.eq(platform.request("q_invert").i) + m.d.comb += mcu_intf.direction.eq(flow_ctl.direction) + m.d.comb += mcu_intf.enable.eq(flow_ctl.enable) - # Half-band filter taps. taps_hb1 = [-2, 0, 5, 0, -10, 0,18, 0, -30, 0,53, 0,-101, 0, 323, 512, 323, 0,-101, 0, 53, 0, -30, 0,18, 0, -10, 0, 5, 0,-2] taps_hb1 = [ tap/1024 for tap in taps_hb1 ] @@ -55,7 +164,7 @@ class Top(Elaboratable): tx_chain = { # Clock domain conversion. - "clkconv": ClockConverter(IQSample(12), 8, "sync", "gck1", always_ready=False), + "clkconv": ClockConverter(IQSample(12), 4, "sync", "gck1", always_ready=False), # Half-band interpolation stages (+ skid buffers for timing closure). "hbfir1": HalfBandInterpolatorMAC16(taps_hb1, data_shape=fixed.SQ(11), @@ -67,6 +176,7 @@ class Top(Elaboratable): # CIC interpolation stage. "cic_comp": DomainRenamer("gck1")(FIRFilter([-0.125, 0, 0.75, 0, -0.125], shape=fixed.SQ(11), shape_out=fixed.SQ(11), always_ready=False, num_channels=2)), + "cic_interpolator": CICInterpolator(2, 4, (4, 8, 16, 32), 12, 8, num_channels=2, always_ready=False, domain="gck1"), } @@ -91,7 +201,7 @@ class Top(Elaboratable): m.d.comb += [ # Trigger enable. - mcu_intf.trigger_en .eq(ctrl[7]), + flow_ctl.trigger_en .eq(ctrl[7]), # TX interpolation rate. tx_chain["cic_interpolator"].factor .eq(tx_intrp + 2), diff --git a/firmware/fpga/top/half_precision.py b/firmware/fpga/top/half_precision.py index 974f68ee..4cc0e20b 100644 --- a/firmware/fpga/top/half_precision.py +++ b/firmware/fpga/top/half_precision.py @@ -5,17 +5,128 @@ # Copyright (c) 2024 Great Scott Gadgets # SPDX-License-Identifier: BSD-3-Clause -from amaranth import Elaboratable, Module, DomainRenamer -from amaranth.lib import stream, wiring +from amaranth import Elaboratable, Module, Signal, C, Mux, Instance, Cat, ClockSignal, DomainRenamer, signed +from amaranth.lib import io, stream, wiring, cdc, data, fifo from amaranth.lib.wiring import Out, In, connect from board import PralinePlatform, ClockDomainGenerator -from interface import MAX586xInterface, SGPIOInterface, SPIRegisterInterface +from interface import MAX586xInterface +from interface.spi import SPIRegisterInterface from dsp.dc_block import DCBlock from dsp.round import convergent_round from util import IQSample, ClockConverter +class MCUInterface(wiring.Component): + adc_stream: In(stream.Signature(IQSample(4), always_ready=True)) + dac_stream: Out(stream.Signature(IQSample(4))) + direction: In(1) + enable: In(1) + + def __init__(self, domain="sync"): + self._domain = domain + super().__init__() + + def elaborate(self, platform): + m = Module() + + adc_stream = self.adc_stream + dac_stream = self.dac_stream + + # Determine data transfer direction. + direction = Signal() + enable = Signal() + m.d.sync += enable.eq(self.enable) + m.d.sync += direction.eq(self.direction) + transfer_from_adc = (direction == 0) + transfer_to_dac = (direction == 1) + + # SGPIO clock and data lines. + m.submodules.clk_out = clk_out = io.DDRBuffer("o", platform.request("host_clk", dir="-"), o_domain=self._domain) + m.submodules.host_io = host_io = io.DDRBuffer('io', platform.request("host_data", dir="-"), i_domain=self._domain, o_domain=self._domain) + + # State machine to control SGPIO clock and data lines. + m.d.sync += clk_out.o[0].eq(0) + m.d.sync += clk_out.o[1].eq(0) + m.d.sync += host_io.oe.eq(transfer_from_adc) + + data_to_host = Signal.like(Cat(adc_stream.p.i, adc_stream.p.q)) + assert len(data_to_host) == 8 + m.d.comb += host_io.o[0].eq(data_to_host) + m.d.comb += host_io.o[1].eq(data_to_host) + + tx_dly_write = Signal(2) + m.d.sync += tx_dly_write.eq(tx_dly_write << 1) + m.d.comb += dac_stream.payload.eq(host_io.i[1]) + m.d.comb += dac_stream.valid.eq(tx_dly_write[-1]) + + with m.FSM(): + with m.State("IDLE"): + with m.If(enable): + with m.If(transfer_from_adc & adc_stream.valid): + m.d.sync += data_to_host.eq(Cat(adc_stream.p.i, adc_stream.p.q)) + m.d.sync += clk_out.o[1].eq(1) + + with m.Elif(transfer_to_dac & dac_stream.ready): + m.d.sync += clk_out.o[0].eq(1) + m.d.sync += tx_dly_write[0].eq(1) # delayed write + + if self._domain != "sync": + m = DomainRenamer(self._domain)(m) + + return m + + +class FlowAndTriggerControl(wiring.Component): + trigger_en: In(1) + direction: Out(1) # async + enable: Out(1) # async + adc_capture: Out(1) + dac_capture: Out(1) + + def __init__(self, domain): + super().__init__() + self._domain = domain + + def elaborate(self, platform): + m = Module() + + # + # Signal synchronization and trigger logic. + # + trigger_enable = self.trigger_en + trigger_in = platform.request("trigger_in").i + trigger_out = platform.request("trigger_out").o + host_data_enable = ~platform.request("disable").i + m.d.comb += trigger_out.eq(host_data_enable) + + # Create a latch for the trigger input signal using a FPGA primitive. + trigger_in_latched = Signal() + trigger_in_reg = Instance("SB_DFFES", + i_D = 0, + i_S = trigger_in, # async set + i_E = ~host_data_enable, + i_C = ClockSignal(self._domain), + o_Q = trigger_in_latched + ) + m.submodules.trigger_in_reg = trigger_in_reg + + # Export signals for direction control and gating captures. + m.d.comb += self.direction.eq(platform.request("direction").i) + m.d.comb += self.enable.eq(host_data_enable) + + with m.If(host_data_enable): + m.d[self._domain] += self.adc_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 0)) + m.d[self._domain] += self.dac_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 1)) + with m.Else(): + m.d[self._domain] += self.adc_capture.eq(0) + m.d[self._domain] += self.dac_capture.eq(0) + + return m + + + + class IQHalfPrecisionConverter(wiring.Component): input: In(stream.Signature(IQSample(8), always_ready=True)) output: Out(stream.Signature(IQSample(4), always_ready=True)) @@ -56,18 +167,22 @@ class Top(Elaboratable): m.submodules.clkgen = ClockDomainGenerator() # Submodules. + m.submodules.flow_ctl = flow_ctl = FlowAndTriggerControl(domain="gck1") m.submodules.adcdac_intf = adcdac_intf = MAX586xInterface(bb_domain="gck1") - m.submodules.mcu_intf = mcu_intf = SGPIOInterface(sample_width=8, domain="sync") + m.submodules.mcu_intf = mcu_intf = MCUInterface(domain="sync") + m.d.comb += adcdac_intf.adc_capture.eq(flow_ctl.adc_capture) + m.d.comb += adcdac_intf.dac_capture.eq(flow_ctl.dac_capture) m.d.comb += adcdac_intf.q_invert.eq(platform.request("q_invert").i) + m.d.comb += mcu_intf.direction.eq(flow_ctl.direction) + m.d.comb += mcu_intf.enable.eq(flow_ctl.enable) rx_chain = { "dc_block": DCBlock(width=8, num_channels=2, domain="gck1"), "half_prec": DomainRenamer("gck1")(IQHalfPrecisionConverter()), - "clkconv": ClockConverter(IQSample(4), 16, "gck1", "sync"), + "clkconv": ClockConverter(IQSample(4), 4, "gck1", "sync"), } - for k,v in rx_chain.items(): - m.submodules[f"rx_{k}"] = v + m.submodules += rx_chain.values() # Connect receiver chain. last = adcdac_intf.adc_stream @@ -78,11 +193,10 @@ class Top(Elaboratable): tx_chain = { - "clkconv": ClockConverter(IQSample(4), 16, "sync", "gck1", always_ready=False), + "clkconv": ClockConverter(IQSample(4), 4, "sync", "gck1", always_ready=False), "half_prec": DomainRenamer("gck1")(IQHalfPrecisionConverterInv()), } - for k,v in tx_chain.items(): - m.submodules[f"tx_{k}"] = v + m.submodules += tx_chain.values() # Connect transmitter chain. last = mcu_intf.dac_stream @@ -99,7 +213,7 @@ class Top(Elaboratable): ctrl = spi_regs.add_register(0x01, init=0) m.d.comb += [ # Trigger enable. - mcu_intf.trigger_en .eq(ctrl[7]), + flow_ctl.trigger_en .eq(ctrl[7]), # RX settings. rx_chain["dc_block"].enable .eq(ctrl[0]), @@ -110,4 +224,4 @@ class Top(Elaboratable): if __name__ == "__main__": plat = PralinePlatform() - plat.build(Top()) + plat.build(Top_HP()) diff --git a/firmware/fpga/top/standard.py b/firmware/fpga/top/standard.py index 7f85925b..50c73df8 100644 --- a/firmware/fpga/top/standard.py +++ b/firmware/fpga/top/standard.py @@ -4,20 +4,168 @@ # Copyright (c) 2025 Great Scott Gadgets # SPDX-License-Identifier: BSD-3-Clause -from amaranth import Elaboratable, Module, Signal, Mux, DomainRenamer -from amaranth.lib import cdc -from amaranth.lib.wiring import connect +from amaranth import Elaboratable, Module, Signal, Mux, Instance, Cat, ClockSignal, DomainRenamer, EnableInserter +from amaranth.lib import io, fifo, stream, wiring, cdc +from amaranth.lib.wiring import Out, In, connect from amaranth_future import fixed from board import PralinePlatform, ClockDomainGenerator -from interface import MAX586xInterface, SGPIOInterface, SPIRegisterInterface +from interface import MAX586xInterface +from interface.spi import SPIRegisterInterface from dsp.fir import HalfBandDecimator, HalfBandInterpolator -from dsp.cic import CICInterpolator +from dsp.cic import CICDecimator, CICInterpolator from dsp.dc_block import DCBlock from dsp.quarter_shift import QuarterShift from dsp.nco import NCO -from util import ClockConverter, IQSample, StreamSkidBuffer +from util import ClockConverter, IQSample, StreamSkidBuffer, LinearFeedbackShiftRegister + + +class MCUInterface(wiring.Component): + adc_stream: In(stream.Signature(IQSample(8), always_ready=True)) + dac_stream: Out(stream.Signature(IQSample(8))) + direction: In(1) + enable: In(1) + prbs: In(1) + + def __init__(self, domain="sync"): + self._domain = domain + super().__init__() + + def elaborate(self, platform): + m = Module() + + adc_stream = self.adc_stream + dac_stream = self.dac_stream + + # Determine data transfer direction. + direction = Signal() + enable = Signal() + m.submodules.enable_cdc = cdc.FFSynchronizer(self.enable, enable, o_domain=self._domain) + m.submodules.direction_cdc = cdc.FFSynchronizer(self.direction, direction, o_domain=self._domain) + transfer_from_adc = (direction == 0) + transfer_to_dac = (direction == 1) + + # SGPIO clock and data lines. + m.submodules.clk_out = clk_out = io.DDRBuffer("o", platform.request("host_clk", dir="-"), o_domain=self._domain) + m.submodules.host_io = host_io = io.DDRBuffer('io', platform.request("host_data", dir="-"), i_domain=self._domain, o_domain=self._domain) + + # State machine to control SGPIO clock and data lines. + tx_clk_en = Signal() + rx_clk_en = Signal() + m.d.sync += clk_out.o[0].eq(tx_clk_en) + m.d.sync += clk_out.o[1].eq(rx_clk_en) + m.d.sync += host_io.oe.eq(transfer_from_adc) + + data_to_host = Signal.like(adc_stream.p) + m.d.comb += host_io.o[0].eq(data_to_host) + m.d.comb += host_io.o[1].eq(data_to_host) + + tx_dly_write = Signal(3) + host_io_prev_data = Signal(8) + m.d.sync += tx_dly_write.eq(tx_dly_write << 1) + m.d.sync += host_io_prev_data.eq(host_io.i[1]) + + # Small TX FIFO to avoid overflows from the write delay. + m.submodules.tx_fifo = tx_fifo = fifo.SyncFIFOBuffered(width=16, depth=8) + m.d.comb += [ + tx_fifo.w_data .eq(Cat(host_io_prev_data, host_io.i[1])), + tx_fifo.w_en .eq(tx_dly_write[-1]), + dac_stream.p .eq(tx_fifo.r_data), + dac_stream.valid .eq(tx_fifo.r_rdy), + tx_fifo.r_en .eq(dac_stream.ready), + ] + + # Pseudo-random binary sequence generator. + prbs_advance = Signal() + prbs_count = Signal(2) + m.submodules.prbs = prbs = EnableInserter(prbs_advance)( + LinearFeedbackShiftRegister(degree=8, taps=[8,6,5,4], init=0b10110001)) + + with m.FSM(): + with m.State("IDLE"): + m.d.comb += tx_clk_en.eq(enable & transfer_to_dac & dac_stream.ready) + m.d.comb += rx_clk_en.eq(enable & transfer_from_adc & adc_stream.valid) + + with m.If(self.prbs): + m.next = "PRBS" + with m.Elif(rx_clk_en): + m.d.sync += data_to_host.eq(adc_stream.p) + m.next = "RX_Q" + with m.Elif(tx_clk_en): + m.next = "TX_Q" + + with m.State("RX_Q"): + m.d.comb += rx_clk_en.eq(1) + m.d.sync += data_to_host.i.eq(data_to_host.q) + m.next = "IDLE" + + with m.State("TX_Q"): + m.d.comb += tx_clk_en.eq(1) + m.d.sync += tx_dly_write[0].eq(1) # delayed write + m.next = "IDLE" + + with m.State("PRBS"): + m.d.sync += host_io.oe.eq(1) + m.d.sync += data_to_host.eq(prbs.value) + m.d.comb += rx_clk_en.eq(prbs_count == 0) + m.d.comb += prbs_advance.eq(prbs_count == 0) + m.d.sync += prbs_count.eq(prbs_count + 1) + with m.If(~self.prbs): + m.next = "IDLE" + + if self._domain != "sync": + m = DomainRenamer(self._domain)(m) + + return m + + +class FlowAndTriggerControl(wiring.Component): + trigger_en: In(1) + direction: Out(1) # async + enable: Out(1) # async + adc_capture: Out(1) + dac_capture: Out(1) + + def __init__(self, domain): + super().__init__() + self._domain = domain + + def elaborate(self, platform): + m = Module() + + # + # Signal synchronization and trigger logic. + # + trigger_enable = self.trigger_en + trigger_in = platform.request("trigger_in").i + trigger_out = platform.request("trigger_out").o + host_data_enable = ~platform.request("disable").i + m.d.comb += trigger_out.eq(host_data_enable) + + # Create a latch for the trigger input signal using a special FPGA primitive. + trigger_in_latched = Signal() + trigger_in_reg = Instance("SB_DFFES", + i_D = 0, + i_S = trigger_in, # async set + i_E = ~host_data_enable, + i_C = ClockSignal(self._domain), + o_Q = trigger_in_latched + ) + m.submodules.trigger_in_reg = trigger_in_reg + + # Export signals for direction control and capture gating. + m.d.comb += self.direction.eq(platform.request("direction").i) + m.d.comb += self.enable.eq(host_data_enable) + + with m.If(host_data_enable): + m.d[self._domain] += self.adc_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 0)) + m.d[self._domain] += self.dac_capture.eq((trigger_in_latched | ~trigger_enable) & (self.direction == 1)) + with m.Else(): + m.d[self._domain] += self.adc_capture.eq(0) + m.d[self._domain] += self.dac_capture.eq(0) + + return m class Top(Elaboratable): @@ -28,10 +176,15 @@ class Top(Elaboratable): m.submodules.clkgen = ClockDomainGenerator() # Submodules. + m.submodules.flow_ctl = flow_ctl = FlowAndTriggerControl(domain="gck1") m.submodules.adcdac_intf = adcdac_intf = MAX586xInterface(bb_domain="gck1") - m.submodules.mcu_intf = mcu_intf = SGPIOInterface(sample_width=16, domain="sync") + m.submodules.mcu_intf = mcu_intf = MCUInterface(domain="sync") + m.d.comb += adcdac_intf.adc_capture.eq(flow_ctl.adc_capture) + m.d.comb += adcdac_intf.dac_capture.eq(flow_ctl.dac_capture) m.d.comb += adcdac_intf.q_invert.eq(platform.request("q_invert").i) + m.d.comb += mcu_intf.direction.eq(flow_ctl.direction) + m.d.comb += mcu_intf.enable.eq(flow_ctl.enable) # Half-band filter taps. taps = [-2, 0, 7, 0, -18, 0, 41, 0, -92, 0, 320, 512, 320, 0, -92, 0, 41, 0, -18, 0, 7, 0, -2] @@ -68,7 +221,7 @@ class Top(Elaboratable): "hbfir1": HalfBandDecimator(taps, **common_rx_filter_opts), # Clock domain conversion. - "clkconv": ClockConverter(IQSample(8), 8, "gck1", "sync"), + "clkconv": ClockConverter(IQSample(8), 4, "gck1", "sync"), } for k,v in rx_chain.items(): m.submodules[f"rx_{k}"] = v @@ -82,7 +235,7 @@ class Top(Elaboratable): tx_chain = { # Clock domain conversion. - "clkconv": ClockConverter(IQSample(8), 8, "sync", "gck1", always_ready=False), + "clkconv": ClockConverter(IQSample(8), 4, "sync", "gck1", always_ready=False), # Half-band interpolation stages (+ skid buffers for timing closure). "hbfir1": HalfBandInterpolator(taps, data_shape=fixed.SQ(7), @@ -95,7 +248,6 @@ class Top(Elaboratable): # CIC interpolation stage. "cic_interpolator": CICInterpolator(1, 3, (1, 2, 4, 8), 8, 8, num_channels=2, always_ready=False, domain="gck1"), - "skid4": DomainRenamer("gck1")(StreamSkidBuffer(IQSample(8), always_ready=False)), } for k,v in tx_chain.items(): m.submodules[f"tx_{k}"] = v @@ -111,7 +263,7 @@ class Top(Elaboratable): m.d.comb += [ adcdac_intf.dac_stream.p.eq(nco.output), adcdac_intf.dac_stream.valid.eq(1), - last.ready.eq(1), + tx_chain["cic_interpolator"].output.ready.eq(1), ] with m.Else(): connect(m, last, adcdac_intf.dac_stream) @@ -129,7 +281,7 @@ class Top(Elaboratable): m.d.sync += [ # Trigger enable. - mcu_intf.trigger_en .eq(ctrl[7]), + flow_ctl.trigger_en .eq(ctrl[7]), # PRBS enable. mcu_intf.prbs .eq(ctrl[6]), diff --git a/firmware/fpga/util/__init__.py b/firmware/fpga/util/__init__.py index 47e88e5e..75334121 100644 --- a/firmware/fpga/util/__init__.py +++ b/firmware/fpga/util/__init__.py @@ -35,7 +35,7 @@ class ClockConverter(wiring.Component): def elaborate(self, platform): m = Module() - m.submodules.mem = mem = fifo.AsyncFIFOBuffered( + m.submodules.mem = mem = fifo.AsyncFIFO( width=Shape.cast(self.shape).width, depth=self.depth, r_domain=self._output_domain,