2020-08-26

ReuseAllocations

We're getting closer and closer to a ReuseAllocations.

For reference, here is the graph and coloring for psum.fut:

graph = [(mem_764, mem_768), (mem_768, mem_772)]

coloring = [(mem_764, 0), (mem_768, 1), (mem_772, 0)]

There are two colors, but for some reason, my optimiseKernel produces three colors?

Eh, there's something strange going on when I call analyseSegOps directly. By hoisting the interference graph analysis, I seemed to fix the problem. Is this a long-term solution though? We need the interference graphs to be per-kernel. I guess they are so now.

I now have… something:

 1: #[incremental_flattening(only_intra)]
 2: let {i64 binop_x_774} = sext i32 impl₀_245 to i64
 3: #[incremental_flattening(only_intra)]
 4: let {i64 binop_y_775} = sext i32 impl₁_246 to i64
 5: #[incremental_flattening(only_intra)]
 6: let {i64 binop_x_776} = mul_nw64(binop_x_774, binop_y_775)
 7: #[incremental_flattening(only_intra)]
 8: let {i64 bytes_773} = mul_nw64(4i64, binop_x_776)
 9: #[incremental_flattening(only_intra)]
10: let {mem mem_777} =
11:   alloc(bytes_773)
12: let {i64 binop_x_763} = binop_y_775
13: let {i64 bytes_762} = mul_nw64(4i64, binop_y_775)
14: let {i64 binop_x_767} = binop_y_775
15: let {i64 bytes_766} = bytes_762
16: let {i64 binop_x_771} = binop_y_775
17: let {i64 bytes_770} = bytes_762
18: #[incremental_flattening(only_intra)]
19: -- res_408 : [impl₀_245][impl₁_246]i32@@mem_777->
20: -- {base: [impl₀_245, impl₁_246]; contiguous: True; LMADs: [{offset: 0i32;
21: --                                                           strides: [impl₁_246, 1i32];
22: --                                                           rotates: [0i32, 0i32];
23: --                                                           shape: [impl₀_245, impl₁_246];
24: --                                                           permutation: [0, 1];
25: --                                                           monotonicity: [Inc, Inc]}]}
26: let {[impl₀_245][impl₁_246]i32 res_408} =
27:   segmap_group
28:   (#groups=impl₀_245; groupsize=impl₁_246)
29:   (gtid_292 < impl₀_245) (~phys_tid_305) : {[impl₁_246]i32} {
30:     let {mem color_778} =
31:       alloc(bytes_762)
32:     let {mem color_779} =
33:       alloc(bytes_762)
34:     let {mem@local mem_764} = color_778
35:     -- resarr0_415 : [impl₁_246]i32@@mem_764->
36:     -- {base: [impl₁_246]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
37:     --                                                rotates: [0i32];
38:     --                                                shape: [impl₁_246];
39:     --                                                permutation: [0];
40:     --                                                monotonicity: [Inc]}]}
41:     let {[impl₁_246]i32 resarr0_415} =
42:       segscan_thread
43:       (#groups=impl₀_245; groupsize=impl₁_246)
44:       ({{0i32},
45:         [],
46:         fn {i32} (i32 x_416, i32 x_417) =>
47:           let {i32 res_418} = add32(x_416, x_417)
48:           in {res_418}})
49:       (gtid_295 < impl₁_246) (~phys_tid_296) : {i32} {
50:         let {i32 x_419} = xss_247[gtid_292, gtid_295]
51:         return {returns x_419}
52:       }
53:     let {mem@local mem_768} = color_779
54:     -- resarr0_425 : [impl₁_246]i32@@mem_768->
55:     -- {base: [impl₁_246]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
56:     --                                                rotates: [0i32];
57:     --                                                shape: [impl₁_246];
58:     --                                                permutation: [0];
59:     --                                                monotonicity: [Inc]}]}
60:     let {[impl₁_246]i32 resarr0_425} =
61:       segscan_thread
62:       (#groups=impl₀_245; groupsize=impl₁_246)
63:       ({{0i32},
64:         [],
65:         fn {i32} (i32 x_426, i32 x_427) =>
66:           let {i32 res_428} = add32(x_426, x_427)
67:           in {res_428}})
68:       (gtid_297 < impl₁_246) (~phys_tid_298) : {i32} {
69:         let {i32 x_429} = resarr0_415[gtid_297]
70:         return {returns x_429}
71:       }
72:     let {mem@local mem_772} = color_778
73:     -- resarr0_434 : [impl₁_246]i32@@mem_772->
74:     -- {base: [impl₁_246]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
75:     --                                                rotates: [0i32];
76:     --                                                shape: [impl₁_246];
77:     --                                                permutation: [0];
78:     --                                                monotonicity: [Inc]}]}
79:     let {[impl₁_246]i32 resarr0_434} =
80:       segscan_thread
81:       (#groups=impl₀_245; groupsize=impl₁_246)
82:       ({{0i32},
83:         [],
84:         fn {i32} (i32 x_435, i32 x_436) =>
85:           let {i32 res_437} = add32(x_435, x_436)
86:           in {res_437}})
87:       (gtid_299 < impl₁_246) (~phys_tid_300) : {i32} {
88:         let {i32 x_438} = resarr0_425[gtid_299]
89:         return {returns x_438}
90:       }
91:     return {returns resarr0_434}
92:   }

There's (at least) one problem though: I don't take the allocation space into account. For instance, on line 34, color_778, a memory block residing in default memory, is assigned to mem_764, which is supposed to reside in local memory. Somehow, we need to make either the interference graph, the coloring algorithm, or the allocation reuser (or some combination of the three) aware of memory space such that only memory blocks in the same memory space are merged, and such that the new colored blocks are allocated in the correct space.

By manually setting the space to Space "local", we can inspect what the code would look like:

  1: -- xss_247 : [impl₀_245][impl₁_246]i32@@xss_mem_759->
  2: -- {base: [impl₀_245, impl₁_246]; contiguous: True; LMADs: [{offset: 0i32;
  3: --                                                           strides: [impl₁_246, 1i32];
  4: --                                                           rotates: [0i32, 0i32];
  5: --                                                           shape: [impl₀_245, impl₁_246];
  6: --                                                           permutation: [0, 1];
  7: --                                                           monotonicity: [Inc, Inc]}]}
  8: entry {*[?0][?1]i32@?2->
  9:        {base: [?0, ?1]; contiguous: True; LMADs: [{offset: 0i32;
 10:                                                    strides: [?1, 1i32];
 11:                                                    rotates: [0i32, 0i32];
 12:                                                    shape: [?0, ?1];
 13:                                                    permutation: [0, 1];
 14:                                                    monotonicity: [Inc, Inc]}]}}
 15: main (mem xss_mem_759, i32 impl₀_245, i32 impl₁_246,
 16:       [impl₀_245][impl₁_246]i32 xss_247) = {
 17:   #[incremental_flattening(only_intra)]
 18:   let {i64 binop_x_774} = sext i32 impl₀_245 to i64
 19:   #[incremental_flattening(only_intra)]
 20:   let {i64 binop_y_775} = sext i32 impl₁_246 to i64
 21:   #[incremental_flattening(only_intra)]
 22:   let {i64 binop_x_776} = mul_nw64(binop_x_774, binop_y_775)
 23:   #[incremental_flattening(only_intra)]
 24:   let {i64 bytes_773} = mul_nw64(4i64, binop_x_776)
 25:   #[incremental_flattening(only_intra)]
 26:   let {mem mem_777} =
 27:     alloc(bytes_773)
 28:   let {i64 bytes_762} = mul_nw64(4i64, binop_y_775)
 29:   #[incremental_flattening(only_intra)]
 30:   -- res_408 : [impl₀_245][impl₁_246]i32@@mem_777->
 31:   -- {base: [impl₀_245, impl₁_246]; contiguous: True; LMADs: [{offset: 0i32;
 32:   --                                                           strides: [impl₁_246, 1i32];
 33:   --                                                           rotates: [0i32, 0i32];
 34:   --                                                           shape: [impl₀_245, impl₁_246];
 35:   --                                                           permutation: [0, 1];
 36:   --                                                           monotonicity: [Inc, Inc]}]}
 37:   let {[impl₀_245][impl₁_246]i32 res_408} =
 38:     segmap_group
 39:     (#groups=impl₀_245; groupsize=impl₁_246)
 40:     (gtid_292 < impl₀_245) (~phys_tid_305) : {[impl₁_246]i32} {
 41:       let {mem@local color_778} =
 42:         alloc(bytes_762, @local)
 43:       let {mem@local color_779} =
 44:         alloc(bytes_762, @local)
 45:       -- resarr0_415 : [impl₁_246]i32@@color_778->
 46:       -- {base: [impl₁_246]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
 47:       --                                                rotates: [0i32];
 48:       --                                                shape: [impl₁_246];
 49:       --                                                permutation: [0];
 50:       --                                                monotonicity: [Inc]}]}
 51:       let {[impl₁_246]i32 resarr0_415} =
 52:         segscan_thread
 53:         (#groups=impl₀_245; groupsize=impl₁_246)
 54:         ({{0i32},
 55:           [],
 56:           fn {i32} (i32 x_416, i32 x_417) =>
 57:             let {i32 res_418} = add32(x_416, x_417)
 58:             in {res_418}})
 59:         (gtid_295 < impl₁_246) (~phys_tid_296) : {i32} {
 60:           let {i32 x_419} = xss_247[gtid_292, gtid_295]
 61:           return {returns x_419}
 62:         }
 63:       -- resarr0_425 : [impl₁_246]i32@@color_779->
 64:       -- {base: [impl₁_246]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
 65:       --                                                rotates: [0i32];
 66:       --                                                shape: [impl₁_246];
 67:       --                                                permutation: [0];
 68:       --                                                monotonicity: [Inc]}]}
 69:       let {[impl₁_246]i32 resarr0_425} =
 70:         segscan_thread
 71:         (#groups=impl₀_245; groupsize=impl₁_246)
 72:         ({{0i32},
 73:           [],
 74:           fn {i32} (i32 x_426, i32 x_427) =>
 75:             let {i32 res_428} = add32(x_426, x_427)
 76:             in {res_428}})
 77:         (gtid_297 < impl₁_246) (~phys_tid_298) : {i32} {
 78:           let {i32 x_429} = resarr0_415[gtid_297]
 79:           return {returns x_429}
 80:         }
 81:       -- resarr0_434 : [impl₁_246]i32@@color_778->
 82:       -- {base: [impl₁_246]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
 83:       --                                                rotates: [0i32];
 84:       --                                                shape: [impl₁_246];
 85:       --                                                permutation: [0];
 86:       --                                                monotonicity: [Inc]}]}
 87:       let {[impl₁_246]i32 resarr0_434} =
 88:         segscan_thread
 89:         (#groups=impl₀_245; groupsize=impl₁_246)
 90:         ({{0i32},
 91:           [],
 92:           fn {i32} (i32 x_435, i32 x_436) =>
 93:             let {i32 res_437} = add32(x_435, x_436)
 94:             in {res_437}})
 95:         (gtid_299 < impl₁_246) (~phys_tid_300) : {i32} {
 96:           let {i32 x_438} = resarr0_425[gtid_299]
 97:           return {returns x_438}
 98:         }
 99:       return {returns resarr0_434}
100:     }
101:   in {impl₀_245, impl₁_246, mem_777, res_408}
102: }

Promising, I think!

However, calling my program on OptionPricing.fut results in an error:

$ cabal run futhark-linear-scan -- ../futhark-benchmarks/finpar/OptionPricing.fut
Up to date
optimise graph: [(dir_vs_mem_9186, inpacc_mem_9405), (dir_vs_mem_9186, acc0_mem_9410),
 (dir_vs_mem_9186, mem_9431), (mem_9241, inpacc_mem_9405), (mem_9241,
                                                            acc0_mem_9410),
 (mem_9241, mem_9431), (mem_9258, mem_9271), (mem_9258, mem_9286), (mem_9258,
                                                                    inpacc_mem_9405),
 (mem_9258, acc0_mem_9410), (mem_9258, mem_9431), (mem_9271, inpacc_mem_9405),
 (mem_9271, acc0_mem_9410), (mem_9271, mem_9431), (mem_9286, inpacc_mem_9405),
 (mem_9286, acc0_mem_9410), (mem_9286, mem_9431), (mem_9335, mem_9347),
 (mem_9335, mem_9360), (mem_9335, inpacc_mem_9405), (mem_9335, acc0_mem_9410),
 (mem_9347, mem_9360), (mem_9347, inpacc_mem_9405), (mem_9347, acc0_mem_9410),
 (mem_9360, inpacc_mem_9405), (mem_9360, acc0_mem_9410), (mem_9388,
                                                          inpacc_mem_9405),
 (mem_9388, acc0_mem_9410), (inpacc_mem_9405, acc0_mem_9410), (inpacc_mem_9405,
                                                               mem_9431),
 (acc0_mem_9410, mem_9431)]
res:


graph: [(dir_vs_mem_9186, inpacc_mem_9405), (dir_vs_mem_9186, acc0_mem_9410),
 (dir_vs_mem_9186, mem_9431), (mem_9241, inpacc_mem_9405), (mem_9241,
                                                            acc0_mem_9410),
 (mem_9241, mem_9431), (mem_9258, mem_9271), (mem_9258, mem_9286), (mem_9258,
                                                                    inpacc_mem_9405),
 (mem_9258, acc0_mem_9410), (mem_9258, mem_9431), (mem_9271, inpacc_mem_9405),
 (mem_9271, acc0_mem_9410), (mem_9271, mem_9431), (mem_9286, inpacc_mem_9405),
 (mem_9286, acc0_mem_9410), (mem_9286, mem_9431), (mem_9335, mem_9347),
 (mem_9335, mem_9360), (mem_9335, inpacc_mem_9405), (mem_9335, acc0_mem_9410),
 (mem_9347, mem_9360), (mem_9347, inpacc_mem_9405), (mem_9347, acc0_mem_9410),
 (mem_9360, inpacc_mem_9405), (mem_9360, acc0_mem_9410), (mem_9388,
                                                          inpacc_mem_9405),
 (mem_9388, acc0_mem_9410), (inpacc_mem_9405, acc0_mem_9410), (inpacc_mem_9405,
                                                               mem_9431),
 (acc0_mem_9410, mem_9431)]
coloring: [(dir_vs_mem_9186, 3), (mem_9241, 3), (mem_9258, 4), (mem_9271, 3), (mem_9286,
                                                                     3),
 (mem_9335, 4), (mem_9347, 3), (mem_9360, 0), (mem_9388, 0), (inpacc_mem_9405,
                                                              2),
 (acc0_mem_9410, 1), (mem_9431, 0)]
inverted coloring: [(0, [mem_9360, mem_9388, mem_9431]), (1, [acc0_mem_9410]), (2,
                                                             [inpacc_mem_9405]),
 (3, [dir_vs_mem_9186, mem_9241, mem_9271, mem_9286, mem_9347]), (4, [mem_9258,
                                                                      mem_9335])]
look up: mem_9360
allocs: [(mem_9204, bytes_9195), (mem_9241, bytes_9195), (mem_9258, bytes_9254),
 (mem_9271, bytes_9269), (mem_9286, bytes_9269), (mem_9335, bytes_9198),
 (mem_9347, bytes_9254), (mem_9360, bytes_9269), (mem_9388, bytes_9198),
 (mem_9431, size_9430)]
look up: mem_9388
look up: mem_9431
look up: acc0_mem_9410
futhark-linear-scan: Map.!: given key is not an element in the map
CallStack (from HasCallStack):
  error, called at libraries/containers/containers/src/Data/Map/Internal.hs:627:17 in containers-0.6.2.1:Data.Map.Internal

There's probably an issue with the way I color the graph. For each kernel, I now try to color the whole graph, but I really only need the ones that are present in the current kernel. That may be the issue I'm running in to with regards to OptionPricing. So I should filter the graph first. Then, how do I handle the space issue? I guess the easiest thing is to make GreedyColoring aware or memory spaces.

Ah, and I also need to handle names that are not in the graph for some reason (they don't overlap with anything).

I think I've handled both now in GreedyColoring. Tomorrow, let's add some tests and try to integrate those changes in ReuseAllocations