commit d140293442b0a02edddafd6f5c77c101292d374e
parent c56a07483d3776294346b62851a9ec0ea65f6c5b
Author: Brian Swetland <swetland@frotz.net>
Date: Sun, 12 Feb 2012 00:57:57 -0800
cpu32: pipelined
- three stages: fetch/decode/execute | memory | writeback
- no protection against hazards between register reads and writes (yet)
- todo: stall in the presence of hazards, etc
- now uses sync sram (verilog/syncram.v) instead of async ram
Diffstat:
10 files changed, 258 insertions(+), 81 deletions(-)
diff --git a/Makefile b/Makefile
@@ -1,7 +1,7 @@
# Copyright 2012, Brian Swetland. Use at your own risk.
SRC := verilog/testbench.v
-SRC += verilog/ram.v verilog/rom.v verilog/control.v
+SRC += verilog/ram.v verilog/syncram.v verilog/rom.v verilog/control.v
SRC += verilog/cpu32.v verilog/alu.v verilog/regfile.v
SRC += verilog/uart.v
SRC += verilog/library.v
diff --git a/a32.c b/a32.c
@@ -435,6 +435,7 @@ struct {
const char *fmt;
} decode[] = {
{ 0xFFFFFFFF, 0x00000000, "NOP", },
+ { 0xFFFFFFFF, 0xEEEEEEEE, "NOP", },
{ 0xFFFFFFFF, 0xFFFFFFFF, "HALT", },
{ 0xFFF00000, 0x10F00000, "MOV @B, #@s", }, // ORR Rd, Rz, #I
{ 0xFFF00000, 0x1CF00000, "MLO @B, #0x@u", }, // MLO Rd, Rz, #I
@@ -446,18 +447,18 @@ struct {
{ 0xFF000000, 0x22000000, "LW @B, [@A, #@s]", },
{ 0xFF00FFFF, 0x32000000, "SW @B, [@A]", },
{ 0xFF000000, 0x32000000, "SW @B, [@A, #@s]", },
- { 0xFFFF0000, 0x40FF0000, "B @r", },
- { 0xFFFF0000, 0x40FE0000, "BL @r", },
- { 0xFF0F0000, 0x400F0000, "BZ @A, @r", },
- { 0xFF0F0000, 0x400E0000, "BLZ @A, @r", },
- { 0xFFF0F000, 0x50F0F000, "B @B", },
- { 0xFFF0F000, 0x50F0E000, "BL @B", },
- { 0xFF00F000, 0x5000F000, "BZ @A, @B", },
- { 0xFF00F000, 0x5000E000, "BLZ @A, @B", },
- { 0xFF0F0000, 0x480F0000, "BNZ @A, @r", },
- { 0xFF0F0000, 0x480E0000, "BLNZ @A, @r", },
- { 0xFF00F000, 0x5800F000, "BNZ @A, @B", },
- { 0xFF00F000, 0x5800E000, "BLNZ @A, @B", },
+ { 0xFFFF0000, 0x4FFF0000, "B @r", },
+ { 0xFFFF0000, 0x4FFE0000, "BL @r", },
+ { 0xFF0F0000, 0x4F0F0000, "BZ @A, @r", },
+ { 0xFF0F0000, 0x4F0E0000, "BLZ @A, @r", },
+ { 0xFFF0F000, 0x6FF0F000, "B @B", },
+ { 0xFFF0F000, 0x6FF0E000, "BL @B", },
+ { 0xFF00F000, 0x6F00F000, "BZ @A, @B", },
+ { 0xFF00F000, 0x6F00E000, "BLZ @A, @B", },
+ { 0xFF0F0000, 0x5F0F0000, "BNZ @A, @r", },
+ { 0xFF0F0000, 0x5F0E0000, "BLNZ @A, @r", },
+ { 0xFF00F000, 0x7F00F000, "BNZ @A, @B", },
+ { 0xFF00F000, 0x7F00E000, "BLNZ @A, @B", },
{ 0x00000000, 0x00000000, "UNDEFINED", },
};
@@ -497,11 +498,17 @@ void assemble_line(int n, unsigned *tok, unsigned *num, char **str) {
/* blank lines are fine */
return;
case tNOP:
- emit(0x00000000);
+ emit(0xEEEEEEEE);
return;
case tMOV:
expect_register(tok[1]);
expect(tCOMMA,tok[2]);
+ if (is_register(tok[3])) {
+ /* MOV A,B -> ORR A, B, B */
+ tmp = to_register(tok[3]);
+ emit(TO_D(to_register(tok[1])) | TO_A(tmp) | TO_B(tmp));
+ return;
+ }
expect(tNUMBER,tok[3]);
if (num[3] == 0xFFFF) {
/* special case, need to use MLO */
@@ -535,13 +542,13 @@ void assemble_line(int n, unsigned *tok, unsigned *num, char **str) {
tmp = 14;
}
if (is_register(tok[1])) {
- emit(0x50F00000 | TO_D(tmp) | TO_B(to_register(tok[1])));
+ emit(0x6FF00000 | TO_D(tmp) | TO_B(to_register(tok[1])));
} else if (tok[1] == tSTRING) {
- emit(0x40F00000 | TO_B(tmp));
+ emit(0x4FF00000 | TO_B(tmp));
uselabel(str[1], PC - 1, 16);
} else if ((tok[1] == tNUMBER) || (tok[1] == tDOT)) {
if (!is_signed_16(num[1])) die("branch target out of range");
- emit(0x40F00000 | TO_B(tmp) | TO_I16(num[1]));
+ emit(0x4FF00000 | TO_B(tmp) | TO_I16(num[1]));
} else {
die("expected branch target, got %s", tnames[tok[1]]);
}
@@ -551,16 +558,16 @@ void assemble_line(int n, unsigned *tok, unsigned *num, char **str) {
case tBLNZ:
case tBLZ:
switch (tok[0]) {
- case tBZ: instr = 0x40000000; tmp = 15; break;
- case tBNZ: instr = 0x48000000; tmp = 15; break;
- case tBLZ: instr = 0x40000000; tmp = 14; break;
- case tBLNZ: instr = 0x48000000; tmp = 14; break;
+ case tBZ: instr = 0x4F000000; tmp = 15; break;
+ case tBNZ: instr = 0x5F000000; tmp = 15; break;
+ case tBLZ: instr = 0x4F000000; tmp = 14; break;
+ case tBLNZ: instr = 0x5F000000; tmp = 14; break;
}
expect_register(tok[1]);
expect(tCOMMA,tok[2]);
instr |= TO_A(to_register(tok[1]));
if (is_register(tok[3])) {
- emit(instr | 0x10000000 | TO_D(tmp) | TO_B(to_register(tok[3])));
+ emit(instr | 0x20000000 | TO_D(tmp) | TO_B(to_register(tok[3])));
} else if (tok[3] == tSTRING) {
emit(instr | TO_B(tmp));
uselabel(str[3], PC - 1, 16);
diff --git a/de0nano/de0nano.v b/de0nano/de0nano.v
@@ -45,7 +45,7 @@ cpu32 cpu(
.d_data_r(cpurdata),
.d_data_w(ramwdata),
.d_addr(ramaddr),
- .d_we(ramwe)
+ .d_data_we(ramwe)
);
// ugly hack for now
diff --git a/isa.txt b/isa.txt
@@ -21,6 +21,11 @@ Core Instruction Set
50 BLZ Rd, Ra, Rb if (Ra == 0) { Rd = PC + 4, PC = Rb }
58 BLNZ Rd, Ra, Rb if (Ra != 0) { Rd = PC + 4, PC = Rb }
+4F BLZ Rd, Ra, rel if (Ra == 0) { Rd = PC + 4, PC += I }
+5F BLNZ Rd, Ra, rel if (Ra != 0) { Rd = PC + 4, PC += I }
+6F BLZ Rd, Ra, Rb if (Ra == 0) { Rd = PC + 4, PC = Rb }
+7F BLNZ Rd, Ra, Rb if (Ra != 0) { Rd = PC + 4, PC = Rb }
+
Extended Instruction Set (tbd)
------------------------
@@ -50,7 +55,7 @@ XB SGT Rd, Ra, Rb Rd = Ra > Rb
XC MLO Rd, Ra, Rb Rd = (Ra & 0xFFFF0000) | (Rb & 0xFFFF)
XD MHI Rd, Ra, Rb Rd = (Ra & 0xFFFF) | (Rb << 16)
XE ASR Rd, Ra, Rb Rd = (Ra >>> Rb[0:4])
-XF MUL Rd, Ra, Rb Rd = Ra * Rb
+XF NOP Rd, Ra, Rb Rd = Ra
Pseudo Instructions
-------------------
@@ -58,6 +63,7 @@ Pseudo Instructions
MOV Rd, Rb OR Rd, R0, Rb
SNE Rd, Ra, Rb SUB Rd, Ra, Rb
NOT Rd, Ra XOR Rd, Ra, #-1
+NOP BLNZ Rz, Rz, 0xFFFF (0x5FFFFFFF)
Registers
---------
diff --git a/testbench.sav b/testbench.sav
@@ -1,37 +1,57 @@
[timestart] 0
-[size] 1248 600
+[size] 1260 725
[pos] -1 -1
-*-6.007232 37 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
+*-5.861536 147 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1 -1
[treeopen] testbench.
[treeopen] testbench.cpu.
@28
-testbench.clk
+testbench.cpu.clk
+@800200
+-F/D/E
@22
testbench.cpu.pc[31:0]
testbench.cpu.ir[31:0]
@28
-testbench.cpu.ctl_ram_rd
+testbench.cpu.ctl_ram_we
testbench.cpu.ctl_regs_we
-testbench.cpu.ctl_branch
+testbench.cpu.ctl_wdata_ram
+@22
+testbench.cpu.result[31:0]
+@29
+testbench.cpu.hazard_rrw
+@1000200
+-F/D/E
+@800200
+-MEM
+@22
+testbench.cpu.mem.regs_wsel[3:0]
+@28
+testbench.cpu.mem.regs_we
+testbench.cpu.mem.mem_we
+testbench.cpu.mem.wdata_ram
+@1000200
+-MEM
@800200
--ALU
+-WB
+@28
+testbench.cpu.mem.in_wdata_ram
+@22
+testbench.cpu.wb.data[31:0]
+@28
+testbench.cpu.wb.we
@22
-testbench.cpu.alu.left[31:0]
-testbench.cpu.alu.right[31:0]
-testbench.cpu.alu.out[31:0]
+testbench.cpu.wb.wsel[3:0]
@1000200
--ALU
+-WB
@800200
--MEMORY
+-RAM
@22
-testbench.cpu.d_addr[31:0]
-testbench.cpu.d_data_r[31:0]
-testbench.cpu.d_data_w[31:0]
+testbench.ram.addr[7:0]
+testbench.ram.rdata[31:0]
+testbench.ram.wdata[31:0]
@28
-testbench.cpu.d_we
+testbench.ram.we
@1000200
--MEMORY
-@23
-testbench.cpu.REGS.wdata[31:0]
+-RAM
[pattern_trace] 1
[pattern_trace] 0
diff --git a/verilog/alu.v b/verilog/alu.v
@@ -31,6 +31,6 @@ always @ (*)
4'b1100: out <= { left[31:16], right[15:0] };
4'b1101: out <= { right[15:0], left[31:16] };
4'b1110: out <= (left >>> right[4:0]);
- 4'b1111: out <= (left * right);
+ 4'b1111: out <= left;
endcase
endmodule
diff --git a/verilog/control.v b/verilog/control.v
@@ -11,7 +11,7 @@ module control (
output ctl_regs_we, // 1=write to reg file
output ctl_ram_we, // 1=write to ram
output ctl_alu_altdest, // 0=alu.daddr=opd, 1=alu.daddr=opb
- output [1:0] ctl_wdata_src, // 00=alu,01=ram,10=pc+4,11=0
+ output ctl_wdata_ram, // 0=alu, 1=ram
output ctl_branch_ind, // 0=relative branch, 1=indirect branch
output ctl_branch_taken // 0=pc=pc+4, 1=pc=branch_to
@@ -20,26 +20,29 @@ module control (
wire ctl_branch_op;
wire ctl_branch_nz;
-reg [7:0] control;
+reg [6:0] control;
always @ (*)
case (opcode)
- 4'b0000: control = 8'b00100000; // ALU Rd, Ra, Rb
- 4'b0001: control = 8'b01101000; // ALU Rd, Ra, #I
- 4'b0010: control = 8'b01101001; // LW Rd, [Ra, #I]
- 4'b0011: control = 8'b01010000; // SW Rd, [Ra, #I]
- 4'b0100: control = 8'b10101110; // B rel16
- 4'b0101: control = 8'b10100110; // B Rb
- default: control = 8'b00000000;
+ 4'b0000: control = 7'b0010000; // ALU Rd, Ra, Rb
+ 4'b0001: control = 7'b0110100; // ALU Rd, Ra, #I
+ 4'b0010: control = 7'b0110101; // LW Rd, [Ra, #I]
+ 4'b0011: control = 7'b0101000; // SW Rd, [Ra, #I]
+ 4'b0100: control = 7'b1010110; // BLZ rel16
+ 4'b0101: control = 7'b1010110; // BLNZ rel16
+ 4'b0110: control = 7'b1010010; // BLZ Rb
+ 4'b0111: control = 7'b1010010; // BLNZ Rb
+ 4'b1110: control = 7'b1100000; // NOP
+ default: control = 7'b0000000;
endcase
assign {
ctl_alu_pc, ctl_alu_imm, ctl_regs_we, ctl_ram_we,
- ctl_alu_altdest, ctl_branch_op, ctl_wdata_src
- } = control[7:0];
+ ctl_alu_altdest, ctl_branch_op, ctl_wdata_ram
+ } = control;
-assign ctl_branch_nz = opfunc[3];
-assign ctl_branch_ind = opcode[0];
+assign ctl_branch_nz = opcode[0];
+assign ctl_branch_ind = opcode[1];
assign ctl_branch_taken = (ctl_branch_op & (ctl_adata_zero != ctl_branch_nz));
endmodule
diff --git a/verilog/cpu32.v b/verilog/cpu32.v
@@ -12,7 +12,7 @@ module cpu32 (
output [31:0] d_addr,
output [31:0] d_data_w,
input [31:0] d_data_r,
- output d_we
+ output d_data_we
);
wire [31:0] ir, pc;
@@ -23,7 +23,6 @@ wire [3:0] opcode, opfunc, opsela, opselb, opseld;
wire [15:0] opimm16;
wire [31:0] adata, bdata, wdata, result;
-wire [3:0] alu_wsel;
assign opcode = ir[31:28];
assign opfunc = ir[27:24];
@@ -40,7 +39,7 @@ wire ctl_alu_imm;
wire ctl_regs_we;
wire ctl_ram_we;
wire ctl_alu_altdest;
-wire [1:0] ctl_wdata_src;
+wire ctl_wdata_ram;
wire ctl_branch_ind;
wire ctl_branch_taken;
@@ -54,7 +53,7 @@ control control(
.ctl_regs_we(ctl_regs_we),
.ctl_ram_we(ctl_ram_we),
.ctl_alu_altdest(ctl_alu_altdest),
- .ctl_wdata_src(ctl_wdata_src),
+ .ctl_wdata_ram(ctl_wdata_ram),
.ctl_branch_ind(ctl_branch_ind),
.ctl_branch_taken(ctl_branch_taken)
@@ -71,23 +70,27 @@ register #(32) PC (
.dout(pc)
);
+/* these arrive from writeback */
+wire [31:0] regs_wdata;
+wire [3:0] regs_wsel;
+wire regs_we;
+
regfile REGS (
.reset(reset),
.clk(clk),
- .we(ctl_regs_we),
- .wsel(alu_wsel), .wdata(wdata),
+ .we(regs_we),
+ .wsel(regs_wsel), .wdata(regs_wdata),
.asel(opsela), .adata(adata),
.bsel(opselb), .bdata(bdata)
);
-mux4 #(32) mux_wdata_src(
- .sel(ctl_wdata_src),
- .in0(result),
- .in1(d_data_r),
- .in2(pc_plus_4),
- .in3(32'b0),
- .out(wdata)
- );
+// attempt to identify hazards
+wire hazard_rrw;
+assign hazard_rrw = (((regs_wsel == opsela) | (regs_wsel == opselb)) & regs_we);
+
+assign i_addr = pc;
+assign ir = i_data;
+//assign ir = (hazard_rrw ? 32'hEEEEEEEE : i_data);
assign pc_plus_4 = (pc + 32'h4);
@@ -106,9 +109,6 @@ mux2 #(32) mux_pc_source(
.out(next_pc)
);
-assign i_addr = pc;
-assign ir = i_data;
-
wire [31:0] ainput;
wire [31:0] binput;
@@ -126,11 +126,13 @@ mux2 #(32) mux_alu_right(
.out(binput)
);
+wire [3:0] ctl_wsel;
+
mux2 #(4) alu_wsel_mux(
.sel(ctl_alu_altdest),
.in0(opseld),
.in1(opselb),
- .out(alu_wsel)
+ .out(ctl_wsel)
);
alu alu(
@@ -140,9 +142,126 @@ alu alu(
.out(result)
);
-// SW operation always writes Rb (aka Rd)
-assign d_addr = result;
-assign d_data_w = bdata;
-assign d_we = ctl_ram_we;
+wire [31:0] mem_data;
+wire [3:0] mem_wsel;
+wire mem_we;
+
+memory mem(
+ .clk(clk),
+
+ .in_alu_data(result),
+ .in_reg_data(bdata),
+
+ .in_mem_we(ctl_ram_we),
+ .in_regs_we(ctl_regs_we),
+ .in_regs_wsel(ctl_wsel),
+ .in_wdata_ram(ctl_wdata_ram),
+
+ .out_data(mem_data),
+ .out_wsel(mem_wsel),
+ .out_we(mem_we),
+
+ .d_addr(d_addr),
+ .d_data_r(d_data_r),
+ .d_data_w(d_data_w),
+ .d_data_we(d_data_we)
+ );
+
+writeback wb(
+ .clk(clk),
+
+ .in_data(mem_data),
+ .in_wsel(mem_wsel),
+ .in_we(mem_we),
+
+ .out_we(regs_we),
+ .out_wsel(regs_wsel),
+ .out_data(regs_wdata)
+ );
endmodule
+
+
+module memory( // memory stage: drives the data-RAM port and registers the write-back controls/data for one clock
+ input clk,
+
+ /* interface to sync sram */
+ output [31:0] d_addr, // driven straight from in_alu_data (unregistered)
+ input [31:0] d_data_r, // RAM read data; feeds the out_data mux directly
+ output [31:0] d_data_w, // driven straight from in_reg_data (unregistered)
+ output d_data_we, // driven straight from in_mem_we (unregistered)
+
+ /* interface to processor core */
+ input [31:0] in_alu_data, // ALU result: RAM address now, candidate register write data after the clock edge
+ input [31:0] in_reg_data, // register value to be stored to RAM
+
+ input in_mem_we, // 1 = write in_reg_data to RAM
+ input in_regs_we, // 1 = this instruction writes the register file
+ input [3:0] in_regs_wsel, // destination register index
+ input in_wdata_ram, // selects out_data source; control.v documents it as 0=alu, 1=ram
+
+ output [31:0] out_data, // data handed to the next stage
+ output [3:0] out_wsel, // registered copy of in_regs_wsel
+ output out_we // registered copy of in_regs_we
+ );
+
+ reg [31:0] alu_data;
+ reg [31:0] reg_data; // loaded below but not read anywhere in this module
+ reg mem_we; // loaded below but not read anywhere in this module
+ reg regs_we;
+ reg [3:0] regs_wsel;
+ reg wdata_ram;
+
+ always @(posedge clk) begin // stage registers: capture every in_* signal on the rising edge
+ alu_data <= in_alu_data;
+ reg_data <= in_reg_data;
+ mem_we <= in_mem_we;
+ regs_we <= in_regs_we;
+ regs_wsel <= in_regs_wsel;
+ wdata_ram <= in_wdata_ram;
+ end
+
+ assign d_addr = in_alu_data; // RAM port uses the unregistered inputs, not the stage registers above
+ assign d_data_w = in_reg_data; // NOTE(review): presumably the sync RAM samples these on the same edge that loads the stage registers - confirm
+ assign d_data_we = in_mem_we;
+
+ mux2 #(32) mux_data( // choose between the registered ALU result and the RAM read data
+ .sel(wdata_ram), // registered select, so it lines up with alu_data
+ .in0(alu_data),
+ .in1(d_data_r), // taken directly from the RAM output, no extra register here
+ .out(out_data)
+ );
+
+ assign out_wsel = regs_wsel;
+ assign out_we = regs_we;
+endmodule
+
+module writeback( // writeback stage: delays data, destination select and write enable by one clock
+ input clk,
+
+ input [31:0] in_data, // value to be written to the register file
+ input [3:0] in_wsel, // destination register index
+ input in_we, // 1 = perform the register write
+
+ output out_we, // registered copy of in_we
+ output [3:0] out_wsel, // registered copy of in_wsel
+ output [31:0] out_data // registered copy of in_data
+ );
+
+ reg [31:0] data;
+ reg [3:0] wsel;
+ reg we;
+
+ always @(posedge clk) begin // capture all three inputs on the rising edge
+ data <= in_data;
+ wsel <= in_wsel;
+ we <= in_we;
+ end
+
+ assign out_we = we; // outputs are the stage registers, unmodified
+ assign out_wsel = wsel;
+ assign out_data = data;
+endmodule
+
+
+
diff --git a/verilog/syncram.v b/verilog/syncram.v
@@ -0,0 +1,22 @@
+// RAM - Does not instantiate optimally on Altera FPGAs
+//
+// Copyright 2009, Brian Swetland. Use at your own risk.
+
+`timescale 1ns/1ns
+
+module syncram #(parameter DWIDTH=16, parameter AWIDTH=3) ( // single-port RAM with clocked write and clocked read; DWIDTH = word width, AWIDTH = address width
+ input clk, input we, // we=1 stores wdata at addr on the rising edge
+ input [AWIDTH-1:0] addr, // one address shared by the read and the write
+ input [DWIDTH-1:0] wdata,
+ output reg [DWIDTH-1:0] rdata // registered: updated only on the rising edge of clk
+ );
+
+reg [DWIDTH-1:0] R[0:2**AWIDTH-1]; // storage array: 2**AWIDTH words of DWIDTH bits
+
+always @ (posedge clk) begin
+ if (we)
+ R[addr] <= wdata;
+ rdata <= R[addr]; // nonblocking: samples R[addr] before the write above lands, so a write cycle reads back the old word
+end
+
+endmodule
diff --git a/verilog/testbench.v b/verilog/testbench.v
@@ -31,7 +31,7 @@ cpu32 cpu(
// .d_data_r({24'b0,urdata}),
.d_data_w(ramwdata),
.d_addr(ramaddr),
- .d_we(ramwe)
+ .d_data_we(ramwe)
);
rom #(32,8) rom(
@@ -39,7 +39,7 @@ rom #(32,8) rom(
.data(romdata)
);
-ram #(32,8) ram(
+syncram #(32,8) ram(
.clk(clk),
.addr(ramaddr[9:2]),
.rdata(ramrdata),
@@ -72,7 +72,7 @@ initial begin
$dumpvars(0,testbench);
end
-initial #10000 $finish;
+initial #1000 $finish;
always @(posedge clk) begin