// pluto_hdl_adi/library/util_hbm/util_hbm.v
//
// 570 lines
// 20 KiB
// Verilog

// ***************************************************************************
// ***************************************************************************
// Copyright (C) 2022-2023 Analog Devices, Inc. All rights reserved.
//
// In this HDL repository, there are many different and unique modules, consisting
// of various HDL (Verilog or VHDL) components. The individual modules are
// developed independently, and may be accompanied by separate and unique license
// terms.
//
// The user should read each of these license terms, and understand the
// freedoms and responsibilities that he or she has by using this source/core.
//
// This core is distributed in the hope that it will be useful, but WITHOUT ANY
// WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR
// A PARTICULAR PURPOSE.
//
// Redistribution and use of source or resulting binaries, with or without modification
// of this file, are permitted under one of the following two license terms:
//
// 1. The GNU General Public License version 2 as published by the
// Free Software Foundation, which can be found in the top level directory
// of this repository (LICENSE_GPL2), and also online at:
// <https://www.gnu.org/licenses/old-licenses/gpl-2.0.html>
//
// OR
//
// 2. An ADI specific BSD license, which can be found in the top level directory
// of this repository (LICENSE_ADIBSD), and also on-line at:
// https://github.com/analogdevicesinc/hdl/blob/master/LICENSE_ADIBSD
// This will allow to generate bit files and not release the source code,
// as long as it attaches to an ADI device.
//
// ***************************************************************************
// ***************************************************************************
// This IP serves as storage interfacing element for external memories like
// HBM or DDR4 which have AXI3 or AXI4 data interfaces.
//
// The core leverages the axi_dmac as building blocks by merging an array of
// simplex DMA channels into duplex AXI channels. The core will split the
// incoming data from the source AXIS interface to multiple AXI channels,
// and in the read phase will merge the multiple AXI channels into a single
// AXIS destination interface.
// The number of duplex channels is set by a synthesis parameter and must be
// set to the ratio of the AXIS and AXI3/4 interface widths.
//
// Underflow or Overflow conditions are reported back to the data offload
// through the control/status interface.
//
// Constraints:
// min(SRC_DATA_WIDTH,DST_DATA_WIDTH) / NUM_M >= 8
// In case multiple AXI channels are used, the source and destination AXIS
// interface widths must match.
`timescale 1ns/100ps
// util_hbm
//
// Bridges one slave AXI-Stream interface and one master AXI-Stream interface
// to NUM_M memory mapped AXI masters. For every master index one
// axi_dmac_transfer instance moves stream data to memory (write path) and a
// second one moves memory data back to the stream (read path). The source and
// destination streams are sliced into NUM_M equal lanes, one lane per master.
//
// Parameters:
//   TX_RX_N                 - bit 0 selects the reported error flag:
//                             1 -> rd_underflow is generated, wr_overflow is 0
//                             0 -> wr_overflow is generated, rd_underflow is 0
//   SRC_DATA_WIDTH          - width of s_axis_data in bits
//   DST_DATA_WIDTH          - width of m_axis_data in bits
//   LENGTH_WIDTH            - width of the request/response length buses
//   AXI_PROTOCOL            - 0: AXI4 (8 bit burst length), 1: AXI3 (4 bit)
//   AXI_DATA_WIDTH          - data width of each memory mapped master
//   AXI_ADDR_WIDTH          - address width of each memory mapped master
//   MEM_TYPE                - 1: DDR (all masters use DDR_BASE_ADDDRESS),
//                             otherwise the HBM per master offset is used
//   HBM_SEGMENTS_PER_MASTER - number of 256MB segments owned by each master
//   HBM_SEGMENT_INDEX       - index added to the master index when the HBM
//                             base address of a master is computed
//   DDR_BASE_ADDDRESS       - base address used when MEM_TYPE == 1
//   NUM_M                   - number of AXI masters / stream lanes
//   SRC_FIFO_SIZE           - FIFO_SIZE of the write data movers
//   DST_FIFO_SIZE           - FIFO_SIZE of the read data movers
module util_hbm #(
  parameter TX_RX_N = 1,
  parameter SRC_DATA_WIDTH = 512,
  parameter DST_DATA_WIDTH = 512,
  parameter LENGTH_WIDTH = 32,

  // Memory interface parameters
  parameter AXI_PROTOCOL = 0,    // 0 - AXI4 ; 1 - AXI3
  parameter AXI_DATA_WIDTH = 256,
  parameter AXI_ADDR_WIDTH = 32,
  parameter MEM_TYPE = 2,        // 1 - DDR ; 2 - HBM

  // This will size the storage per master where each segment is 256MB
  parameter HBM_SEGMENTS_PER_MASTER = 4,
  parameter HBM_SEGMENT_INDEX = 0,

  // DDR parameters
  parameter DDR_BASE_ADDDRESS = 0,

  // Number of AXI masters
  parameter NUM_M = 2,

  // Data mover parameters
  parameter SRC_FIFO_SIZE = 8,   // In AXI bursts
  parameter DST_FIFO_SIZE = 8
) (

  // Write (stream to memory) control/status, clocked by s_axis_aclk
  input                                    wr_request_enable,
  input                                    wr_request_valid,
  output                                   wr_request_ready,
  input  [LENGTH_WIDTH-1:0]                wr_request_length,
  output [LENGTH_WIDTH-1:0]                wr_response_measured_length,
  output reg                               wr_response_eot = 1'b0,
  output                                   wr_overflow,

  // Read (memory to stream) control/status, clocked by m_axis_aclk
  input                                    rd_request_enable,
  input                                    rd_request_valid,
  output                                   rd_request_ready,
  input  [LENGTH_WIDTH-1:0]                rd_request_length,
  output reg                               rd_response_eot = 1'b0,
  output                                   rd_underflow,

  // Slave streaming AXI interface
  // (s_axis_strb and s_axis_keep are not used inside this module)
  input                                    s_axis_aclk,
  input                                    s_axis_aresetn,
  output                                   s_axis_ready,
  input                                    s_axis_valid,
  input  [SRC_DATA_WIDTH-1:0]              s_axis_data,
  input  [SRC_DATA_WIDTH/8-1:0]            s_axis_strb,
  input  [SRC_DATA_WIDTH/8-1:0]            s_axis_keep,
  input  [0:0]                             s_axis_user,
  input                                    s_axis_last,

  // Master streaming AXI interface
  input                                    m_axis_aclk,
  input                                    m_axis_aresetn,
  input                                    m_axis_ready,
  output                                   m_axis_valid,
  output [DST_DATA_WIDTH-1:0]              m_axis_data,
  output [DST_DATA_WIDTH/8-1:0]            m_axis_strb,
  output [DST_DATA_WIDTH/8-1:0]            m_axis_keep,
  output [0:0]                             m_axis_user,
  output                                   m_axis_last,

  // Master AXI3 interface
  input                                    m_axi_aclk,
  input                                    m_axi_aresetn,

  // Write address
  output [NUM_M*AXI_ADDR_WIDTH-1:0]        m_axi_awaddr,
  output [NUM_M*(8-(4*AXI_PROTOCOL))-1:0]  m_axi_awlen,
  output [NUM_M*3-1:0]                     m_axi_awsize,
  output [NUM_M*2-1:0]                     m_axi_awburst,
  output [NUM_M-1:0]                       m_axi_awvalid,
  input  [NUM_M-1:0]                       m_axi_awready,

  // Write data
  output [NUM_M*AXI_DATA_WIDTH-1:0]        m_axi_wdata,
  output [NUM_M*(AXI_DATA_WIDTH/8)-1:0]    m_axi_wstrb,
  input  [NUM_M-1:0]                       m_axi_wready,
  output [NUM_M-1:0]                       m_axi_wvalid,
  output [NUM_M-1:0]                       m_axi_wlast,

  // Write response
  input  [NUM_M-1:0]                       m_axi_bvalid,
  input  [NUM_M*2-1:0]                     m_axi_bresp,
  output [NUM_M-1:0]                       m_axi_bready,

  // Read address
  input  [NUM_M-1:0]                       m_axi_arready,
  output [NUM_M-1:0]                       m_axi_arvalid,
  output [NUM_M*AXI_ADDR_WIDTH-1:0]        m_axi_araddr,
  output [NUM_M*(8-(4*AXI_PROTOCOL))-1:0]  m_axi_arlen,
  output [NUM_M*3-1:0]                     m_axi_arsize,
  output [NUM_M*2-1:0]                     m_axi_arburst,

  // Read data and response
  input  [NUM_M*AXI_DATA_WIDTH-1:0]        m_axi_rdata,
  output [NUM_M-1:0]                       m_axi_rready,
  input  [NUM_M-1:0]                       m_axi_rvalid,
  input  [NUM_M*2-1:0]                     m_axi_rresp,
  input  [NUM_M-1:0]                       m_axi_rlast
);

  // Values passed to the DMA_TYPE_* parameters of axi_dmac_transfer
  localparam DMA_TYPE_AXI_MM = 0;
  localparam DMA_TYPE_AXI_STREAM = 1;
  localparam DMA_TYPE_FIFO = 2;

  // Each master handles an equal slice of the stream data buses
  localparam SRC_DATA_WIDTH_PER_M = SRC_DATA_WIDTH / NUM_M;
  localparam DST_DATA_WIDTH_PER_M = DST_DATA_WIDTH / NUM_M;

  localparam AXI_BYTES_PER_BEAT_WIDTH = $clog2(AXI_DATA_WIDTH/8);
  localparam SRC_BYTES_PER_BEAT_WIDTH = $clog2(SRC_DATA_WIDTH_PER_M/8);
  localparam DST_BYTES_PER_BEAT_WIDTH = $clog2(DST_DATA_WIDTH_PER_M/8);

  // Size bursts to the max possible size
  //  AXI 3  1 burst is 16 beats
  //  AXI 4  1 burst is 256 beats
  // Limit one burst to 4096 bytes
  localparam MAX_BYTES_PER_BURST = (AXI_PROTOCOL ? 16 : 256) * AXI_DATA_WIDTH/8;
  localparam MAX_BYTES_PER_BURST_LMT = MAX_BYTES_PER_BURST >= 4096 ? 4096 :
                                       MAX_BYTES_PER_BURST;
  localparam BYTES_PER_BURST_WIDTH = $clog2(MAX_BYTES_PER_BURST_LMT);

  // Width of the AxLEN field of one master: 8 bits for AXI4, 4 bits for AXI3
  localparam AXI_ALEN = (8-(4*AXI_PROTOCOL));

  localparam NUM_M_LOG2 = $clog2(NUM_M);

  genvar i;

  wire [NUM_M-1:0] wr_request_ready_loc;
  wire [NUM_M-1:0] rd_request_ready_loc;
  wire [NUM_M-1:0] wr_request_eot_loc;
  wire [NUM_M-1:0] rd_request_eot_loc;
  wire [NUM_M-1:0] rd_response_valid_loc;
  wire [NUM_M-1:0] wr_response_valid_loc;
  wire             wr_eot_pending_all;
  wire             rd_eot_pending_all;

  // A request is accepted only when every data mover is ready for it
  assign wr_request_ready = &wr_request_ready_loc;
  assign rd_request_ready = &rd_request_ready_loc;

  // Aggregate end of transfer from all masters
  reg [NUM_M-1:0] wr_eot_pending = {NUM_M{1'b0}};
  reg [NUM_M-1:0] rd_eot_pending = {NUM_M{1'b0}};

  assign wr_eot_pending_all = &wr_eot_pending;
  assign rd_eot_pending_all = &rd_eot_pending;

  // The stream handshake signals are the AND of the per lane signals
  wire [NUM_M-1:0] s_axis_ready_loc;
  assign s_axis_ready = &s_axis_ready_loc;

  wire [NUM_M-1:0] m_axis_last_loc;
  assign m_axis_last = &m_axis_last_loc;

  wire [NUM_M-1:0] m_axis_valid_loc;
  assign m_axis_valid = &m_axis_valid_loc;

  // The data movers provide no byte qualifiers or user sideband on the
  // destination stream. Drive the corresponding outputs with the AXI4-Stream
  // default values (all bytes valid, TUSER low) instead of leaving the ports
  // undriven.
  assign m_axis_strb = {(DST_DATA_WIDTH/8){1'b1}};
  assign m_axis_keep = {(DST_DATA_WIDTH/8){1'b1}};
  assign m_axis_user = 1'b0;

  wire [NUM_M-1:0] wr_response_ready_loc;
  wire [NUM_M-1:0] rd_response_ready_loc;

  wire [NUM_M-1:0] wr_overflow_loc;
  wire [NUM_M-1:0] rd_underflow_loc;

  // Measure stored data in case transfer is shorter than programmed,
  // do the measurement only with the first master, all others should be
  // similar.
  localparam LW_PER_M = LENGTH_WIDTH-NUM_M_LOG2;

  wire [NUM_M*BYTES_PER_BURST_WIDTH-1:0] wr_measured_burst_length;
  reg  [LW_PER_M-1:0] wr_response_measured_length_per_m = 'h0;

  // Accumulate the measured burst length of master 0 on every accepted
  // response. One extra count is added for every burst that is not the end
  // of transfer.
  // NOTE(review): this looks like a "length minus one" encoding of the burst
  // and total lengths - confirm against axi_dmac_transfer.
  always @(posedge s_axis_aclk) begin
    if (wr_request_enable == 1'b0) begin
      wr_response_measured_length_per_m <= {LW_PER_M{1'h0}};
    end else if (wr_response_valid_loc[0] == 1'b1 && wr_response_ready_loc[0] == 1'b1) begin
      wr_response_measured_length_per_m <= wr_response_measured_length_per_m +
        {{LW_PER_M-BYTES_PER_BURST_WIDTH{1'b0}},wr_measured_burst_length[BYTES_PER_BURST_WIDTH-1:0]} +
        {{LW_PER_M-1{1'b0}},~wr_request_eot_loc[0]};
    end else if (wr_response_eot == 1'b1) begin
      // Clear the accumulator once the end of transfer has been reported
      wr_response_measured_length_per_m <= {LW_PER_M{1'h0}};
    end
  end

  // Scale the per master length back to the full stream width; the
  // NUM_M_LOG2 least significant bits are filled with ones.
  assign wr_response_measured_length = {wr_response_measured_length_per_m,{NUM_M_LOG2{1'b1}}};

  // The end of transfer outputs are registered copies of the aggregated
  // pending flags
  always @(posedge s_axis_aclk) begin
    wr_response_eot <= wr_eot_pending_all;
  end

  always @(posedge m_axis_aclk) begin
    rd_response_eot <= rd_eot_pending_all;
  end

  generate
  // The generate block is intentionally left unnamed so that the hierarchical
  // names of the instances below stay unchanged.
  for (i = 0; i < NUM_M; i=i+1) begin

    wire [11:0] rd_dbg_status;
    wire        rd_needs_reset;
    wire        s_axis_xfer_req;
    wire        m_axis_xfer_req;
    reg         rd_needs_reset_d = 1'b0;

    // 2Gb (256MB) per segment
    // The offset is evaluated as a 64 bit unsigned constant (the 64'h literal
    // widens the whole expression). This keeps offsets of 4GB and above from
    // wrapping in 32 bit integer arithmetic and keeps the part-selects below
    // in range when AXI_ADDR_WIDTH is larger than 32. For AXI_ADDR_WIDTH up
    // to 32 the selected bits are the same as with a 32 bit evaluation.
    localparam [63:0] ADDR_OFFSET = (MEM_TYPE == 1) ? DDR_BASE_ADDDRESS :
      (HBM_SEGMENT_INDEX+i) * HBM_SEGMENTS_PER_MASTER * 64'h1000_0000;

    // Remember that this master has signalled its end of transfer until all
    // masters have done so
    always @(posedge s_axis_aclk) begin
      if (wr_eot_pending_all) begin
        wr_eot_pending[i] <= 1'b0;
      end else if (wr_request_eot_loc[i] & wr_response_valid_loc[i]) begin
        wr_eot_pending[i] <= 1'b1;
      end
    end

    // For last burst wait until all masters are done
    assign wr_response_ready_loc[i] = wr_request_eot_loc[i] ? wr_eot_pending_all : wr_response_valid_loc[i];

    // Overflow whenever s_axis_ready deasserts during capture (RX_PATH)
    assign wr_overflow_loc[i] = TX_RX_N[0] ? 1'b0 : s_axis_xfer_req & ~s_axis_ready_loc[i];

    // AXIS to AXI3
    // Unused inputs of the memory mapped read channels are tied off in the
    // same way the read instance below ties off its unused write channels.
    axi_dmac_transfer #(
      .DMA_DATA_WIDTH_SRC(SRC_DATA_WIDTH_PER_M),
      .DMA_DATA_WIDTH_DEST(AXI_DATA_WIDTH),
      .DMA_LENGTH_WIDTH(LENGTH_WIDTH),
      .DMA_LENGTH_ALIGN(SRC_BYTES_PER_BEAT_WIDTH),
      .BYTES_PER_BEAT_WIDTH_DEST(AXI_BYTES_PER_BEAT_WIDTH),
      .BYTES_PER_BEAT_WIDTH_SRC(SRC_BYTES_PER_BEAT_WIDTH),
      .BYTES_PER_BURST_WIDTH(BYTES_PER_BURST_WIDTH),
      .DMA_TYPE_DEST(DMA_TYPE_AXI_MM),
      .DMA_TYPE_SRC(DMA_TYPE_AXI_STREAM),
      .DMA_AXI_ADDR_WIDTH(AXI_ADDR_WIDTH),
      .DMA_2D_TRANSFER(1'b0),
      .ASYNC_CLK_REQ_SRC(0),
      .ASYNC_CLK_SRC_DEST(1),
      .ASYNC_CLK_DEST_REQ(1),
      .AXI_SLICE_DEST(1),
      .AXI_SLICE_SRC(1),
      .MAX_BYTES_PER_BURST(MAX_BYTES_PER_BURST_LMT),
      .FIFO_SIZE(SRC_FIFO_SIZE),
      .ID_WIDTH($clog2(SRC_FIFO_SIZE)),
      .AXI_LENGTH_WIDTH_SRC(AXI_ALEN),
      .AXI_LENGTH_WIDTH_DEST(AXI_ALEN),
      .ENABLE_DIAGNOSTICS_IF(0),
      .ALLOW_ASYM_MEM(1)
    ) i_wr_transfer (
      .ctrl_clk(s_axis_aclk),
      .ctrl_resetn(s_axis_aresetn),

      // Control interface
      .ctrl_enable(wr_request_enable),
      .ctrl_pause(1'b0),

      .req_valid(wr_request_valid),
      .req_ready(wr_request_ready_loc[i]),
      .req_dest_address(ADDR_OFFSET[AXI_ADDR_WIDTH-1:AXI_BYTES_PER_BEAT_WIDTH]),
      .req_src_address('h0),
      // Every master transfers 1/NUM_M of the requested length
      .req_x_length(wr_request_length >> NUM_M_LOG2),
      .req_y_length(0),
      .req_dest_stride(0),
      .req_src_stride(0),
      .req_sync_transfer_start(1'b0),
      .req_last(1'b1),

      .req_eot(wr_request_eot_loc[i]),
      .req_measured_burst_length(wr_measured_burst_length[BYTES_PER_BURST_WIDTH*i+:BYTES_PER_BURST_WIDTH]),
      .req_response_partial(),
      .req_response_valid(wr_response_valid_loc[i]),
      .req_response_ready(wr_response_ready_loc[i]),

      .m_dest_axi_aclk(m_axi_aclk),
      .m_dest_axi_aresetn(m_axi_aresetn),
      .m_src_axi_aclk(1'b0),
      .m_src_axi_aresetn(1'b0),

      .m_axi_awaddr(m_axi_awaddr[AXI_ADDR_WIDTH*i+:AXI_ADDR_WIDTH]),
      .m_axi_awlen(m_axi_awlen[AXI_ALEN*i+:AXI_ALEN]),
      .m_axi_awsize(m_axi_awsize[3*i+:3]),
      .m_axi_awburst(m_axi_awburst[2*i+:2]),
      .m_axi_awprot(),
      .m_axi_awcache(),
      .m_axi_awvalid(m_axi_awvalid[i]),
      .m_axi_awready(m_axi_awready[i]),

      .m_axi_wdata(m_axi_wdata[AXI_DATA_WIDTH*i+:AXI_DATA_WIDTH]),
      .m_axi_wstrb(m_axi_wstrb[(AXI_DATA_WIDTH/8)*i+:(AXI_DATA_WIDTH/8)]),
      .m_axi_wready(m_axi_wready[i]),
      .m_axi_wvalid(m_axi_wvalid[i]),
      .m_axi_wlast(m_axi_wlast[i]),

      .m_axi_bvalid(m_axi_bvalid[i]),
      .m_axi_bresp(m_axi_bresp[2*i+:2]),
      .m_axi_bready(m_axi_bready[i]),

      // Memory mapped read channels are not used by the write data mover
      .m_axi_arready(1'b1),
      .m_axi_arvalid(),
      .m_axi_araddr(),
      .m_axi_arlen(),
      .m_axi_arsize(),
      .m_axi_arburst(),
      .m_axi_arprot(),
      .m_axi_arcache(),

      .m_axi_rdata('h0),
      .m_axi_rready(),
      .m_axi_rvalid(1'b0),
      .m_axi_rlast(1'b0),
      .m_axi_rresp(2'b00),

      // All lanes see the same valid/user/last, only the data is sliced
      .s_axis_aclk(s_axis_aclk),
      .s_axis_ready(s_axis_ready_loc[i]),
      .s_axis_valid(s_axis_valid),
      .s_axis_data(s_axis_data[SRC_DATA_WIDTH_PER_M*i+:SRC_DATA_WIDTH_PER_M]),
      .s_axis_user(s_axis_user),
      .s_axis_last(s_axis_last),
      .s_axis_xfer_req(s_axis_xfer_req),

      .m_axis_aclk(1'b0),
      .m_axis_ready(1'b1),
      .m_axis_valid(),
      .m_axis_data(),
      .m_axis_last(),
      .m_axis_xfer_req(),

      .fifo_wr_clk(1'b0),
      .fifo_wr_en(1'b0),
      .fifo_wr_din('b0),
      .fifo_wr_overflow(),
      .fifo_wr_sync(),
      .fifo_wr_xfer_req(),

      .fifo_rd_clk(1'b0),
      .fifo_rd_en(1'b0),
      .fifo_rd_valid(),
      .fifo_rd_dout(),
      .fifo_rd_underflow(),
      .fifo_rd_xfer_req(),

      // DBG
      .dbg_dest_request_id(),
      .dbg_dest_address_id(),
      .dbg_dest_data_id(),
      .dbg_dest_response_id(),
      .dbg_src_request_id(),
      .dbg_src_address_id(),
      .dbg_src_data_id(),
      .dbg_src_response_id(),
      .dbg_status(),

      .dest_diag_level_bursts());

    // Delayed copy of rd_needs_reset, used to detect its falling edge
    always @(posedge m_axis_aclk) begin
      rd_needs_reset_d <= rd_needs_reset;
    end

    // Generate an end of transfer at the end of flush marked by rd_needs_reset
    always @(posedge m_axis_aclk) begin
      if (rd_eot_pending_all) begin
        rd_eot_pending[i] <= 1'b0;
      end else if ((rd_request_eot_loc[i] & rd_response_valid_loc[i]) ||
                   (~rd_needs_reset & rd_needs_reset_d)) begin
        rd_eot_pending[i] <= 1'b1;
      end
    end

    // For last burst wait until all masters are done
    assign rd_response_ready_loc[i] = rd_request_eot_loc[i] ? rd_eot_pending_all : rd_response_valid_loc[i];

    // Underflow whenever m_axis_valid deasserts during play (TX_PATH)
    assign rd_underflow_loc[i] = ~TX_RX_N[0] ? 1'b0 : m_axis_xfer_req & m_axis_ready & ~m_axis_valid_loc[i];

    // AXI3 to MAXIS
    axi_dmac_transfer #(
      .DMA_DATA_WIDTH_SRC(AXI_DATA_WIDTH),
      .DMA_DATA_WIDTH_DEST(DST_DATA_WIDTH_PER_M),
      .DMA_LENGTH_WIDTH(LENGTH_WIDTH),
      .DMA_LENGTH_ALIGN(DST_BYTES_PER_BEAT_WIDTH),
      .BYTES_PER_BEAT_WIDTH_DEST(DST_BYTES_PER_BEAT_WIDTH),
      .BYTES_PER_BEAT_WIDTH_SRC(AXI_BYTES_PER_BEAT_WIDTH),
      .BYTES_PER_BURST_WIDTH(BYTES_PER_BURST_WIDTH),
      .DMA_TYPE_DEST(DMA_TYPE_AXI_STREAM),
      .DMA_TYPE_SRC(DMA_TYPE_AXI_MM),
      .DMA_AXI_ADDR_WIDTH(AXI_ADDR_WIDTH),
      .DMA_2D_TRANSFER(1'b0),
      .ASYNC_CLK_REQ_SRC(1),
      .ASYNC_CLK_SRC_DEST(1),
      .ASYNC_CLK_DEST_REQ(0),
      .AXI_SLICE_DEST(1),
      .AXI_SLICE_SRC(1),
      .MAX_BYTES_PER_BURST(MAX_BYTES_PER_BURST_LMT),
      .FIFO_SIZE(DST_FIFO_SIZE),
      .ID_WIDTH($clog2(DST_FIFO_SIZE)),
      .AXI_LENGTH_WIDTH_SRC(AXI_ALEN),
      .AXI_LENGTH_WIDTH_DEST(AXI_ALEN),
      .ENABLE_DIAGNOSTICS_IF(0),
      .ALLOW_ASYM_MEM(1)
    ) i_rd_transfer (
      .ctrl_clk(m_axis_aclk),
      .ctrl_resetn(m_axis_aresetn),

      // Control interface
      .ctrl_enable(rd_request_enable),
      .ctrl_pause(1'b0),

      .req_valid(rd_request_valid),
      .req_ready(rd_request_ready_loc[i]),
      .req_dest_address(0),
      .req_src_address(ADDR_OFFSET[AXI_ADDR_WIDTH-1:AXI_BYTES_PER_BEAT_WIDTH]),
      // Every master transfers 1/NUM_M of the requested length
      .req_x_length(rd_request_length >> NUM_M_LOG2),
      .req_y_length(0),
      .req_dest_stride(0),
      .req_src_stride(0),
      .req_sync_transfer_start(1'b0),
      .req_last(1'b1),

      .req_eot(rd_request_eot_loc[i]),
      .req_measured_burst_length(),
      .req_response_partial(),
      .req_response_valid(rd_response_valid_loc[i]),
      .req_response_ready(rd_response_ready_loc[i]),

      .m_dest_axi_aclk(1'b0),
      .m_dest_axi_aresetn(1'b0),
      .m_src_axi_aclk(m_axi_aclk),
      .m_src_axi_aresetn(m_axi_aresetn),

      // Memory mapped write channels are not used by the read data mover
      .m_axi_awaddr(),
      .m_axi_awlen(),
      .m_axi_awsize(),
      .m_axi_awburst(),
      .m_axi_awprot(),
      .m_axi_awcache(),
      .m_axi_awvalid(),
      .m_axi_awready(1'b1),

      .m_axi_wdata(),
      .m_axi_wstrb(),
      .m_axi_wready(1'b1),
      .m_axi_wvalid(),
      .m_axi_wlast(),

      .m_axi_bvalid(1'b0),
      .m_axi_bresp(2'b00),
      .m_axi_bready(),

      .m_axi_arready(m_axi_arready[i]),
      .m_axi_arvalid(m_axi_arvalid[i]),
      .m_axi_araddr(m_axi_araddr[AXI_ADDR_WIDTH*i+:AXI_ADDR_WIDTH]),
      .m_axi_arlen(m_axi_arlen[AXI_ALEN*i+:AXI_ALEN]),
      .m_axi_arsize(m_axi_arsize[3*i+:3]),
      .m_axi_arburst(m_axi_arburst[2*i+:2]),
      .m_axi_arprot(),
      .m_axi_arcache(),

      .m_axi_rdata(m_axi_rdata[AXI_DATA_WIDTH*i+:AXI_DATA_WIDTH]),
      .m_axi_rready(m_axi_rready[i]),
      .m_axi_rvalid(m_axi_rvalid[i]),
      .m_axi_rlast(m_axi_rlast[i]),
      .m_axi_rresp(m_axi_rresp[2*i+:2]),

      // Source stream interface is not used by the read data mover
      .s_axis_aclk(1'b0),
      .s_axis_ready(),
      .s_axis_valid(1'b0),
      .s_axis_data('h0),
      .s_axis_user(1'b0),
      .s_axis_last(1'b0),
      .s_axis_xfer_req(),

      // A lane only advances when the aggregated valid of all lanes is
      // accepted downstream, or unconditionally while rd_needs_reset is set
      .m_axis_aclk(m_axis_aclk),
      .m_axis_ready((m_axis_ready & m_axis_valid) | rd_needs_reset),
      .m_axis_valid(m_axis_valid_loc[i]),
      .m_axis_data(m_axis_data[DST_DATA_WIDTH_PER_M*i+:DST_DATA_WIDTH_PER_M]),
      .m_axis_last(m_axis_last_loc[i]),
      .m_axis_xfer_req(m_axis_xfer_req),

      .fifo_wr_clk(1'b0),
      .fifo_wr_en(1'b0),
      .fifo_wr_din('b0),
      .fifo_wr_overflow(),
      .fifo_wr_sync(),
      .fifo_wr_xfer_req(),

      .fifo_rd_clk(1'b0),
      .fifo_rd_en(1'b0),
      .fifo_rd_valid(),
      .fifo_rd_dout(),
      .fifo_rd_underflow(),
      .fifo_rd_xfer_req(),

      // DBG
      .dbg_dest_request_id(),
      .dbg_dest_address_id(),
      .dbg_dest_data_id(),
      .dbg_dest_response_id(),
      .dbg_src_request_id(),
      .dbg_src_address_id(),
      .dbg_src_data_id(),
      .dbg_src_response_id(),
      .dbg_status(rd_dbg_status),

      .dest_diag_level_bursts());

    // Bit 11 of the debug status is used as the "needs reset" indication
    assign rd_needs_reset = rd_dbg_status[11];

  end
  endgenerate

  // Report an error if any of the masters flags one
  assign wr_overflow = |wr_overflow_loc;
  assign rd_underflow = |rd_underflow_loc;

endmodule