2020-07-10

Yesterday, and the plan for today

I got a better start on the linear scan pass yesterday. I decided to use my old last-use implementation for now, since it should work alright. Later, I might want to have a LastUse lore anyway, but I'll try to get it to work without first.

Today, I have to continue writing my pass. In particular, I need to get my lookupMemInfo to work. That requires adding scoping.

Chess/perft

I ran across an interesting chess problem that we might be able to use as a benchmark. perft is a performance test and move path enumeration that is used to determine the number of legal moves in a given chess position. It seems like GPUs have successfully been used for estimating this number up to perft(15), possibly using Monte Carlo situation. It would be interesting to look into and understand the test, and see if we can write a Futhark program that computes it for us.

Some resources

ReuseAllocs

So, I got my basic pass to work today! Here's the result of a run (with debugging output):

  1: $ cabal exec futhark-linear-scan -- tests/arrays2.fut  2>&1
  2: Warning at tests/arrays2.fut:1:11-11:
  3:   Unused variable "n".
  4: stms:
  5: 
  6: last_uses?: []
  7: allocs:
  8: []
  9: frees:
 10: []
 11: last_uses:
 12: []
 13: 
 14: stms:
 15: (0, let {mem mem_260} = alloc(40i64))
 16: (1, -- res_187 : [10i32]i32@@mem_260->
 17:     -- {base: [10i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
 18:     --                                            rotates: [0i32]; shape: [10i32];
 19:     --                                            permutation: [0];
 20:     --                                            monotonicity: [Inc]}]}
 21:  let {[10i32]i32 res_187} = iota32(10i32, 0i32, 1i32))
 22: (2, -- xs_188 : [10i32]i32@@mem_260->
 23:     -- {base: [10i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
 24:     --                                            rotates: [0i32]; shape: [10i32];
 25:     --                                            permutation: [0];
 26:     --                                            monotonicity: [Inc]}]}
 27:  let {[10i32]i32 xs_188} = res_187 with [1i32] <- 0i32)
 28: (3, let {mem mem_263} = alloc(40i64))
 29: (4, -- res_189 : [10i32]i32@@mem_263->
 30:     -- {base: [10i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
 31:     --                                            rotates: [0i32]; shape: [10i32];
 32:     --                                            permutation: [0];
 33:     --                                            monotonicity: [Inc]}]}
 34:  let {[10i32]i32 res_189} = iota32(10i32, 0i32, 1i32))
 35: (5, -- ys_190 : [10i32]i32@@mem_263->
 36:     -- {base: [10i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
 37:     --                                            rotates: [0i32]; shape: [10i32];
 38:     --                                            permutation: [0];
 39:     --                                            monotonicity: [Inc]}]}
 40:  let {[10i32]i32 ys_190} = res_189 with [1i32] <- 0i32)
 41: (6, let {i32 segred_group_size_227} =
 42:       get_size(segred_group_size_226, group_size))
 43: (7, let {i32 num_groups_229} =
 44:       calc_num_groups(10i64, segred_num_groups_228, segred_group_size_227))
 45: (8, let {mem mem_267} = alloc(4i64))
 46: (9, -- acc0_231 : [1i32]i32@@mem_267->
 47:     -- {base: [1i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
 48:     --                                           rotates: [0i32]; shape: [1i32];
 49:     --                                           permutation: [0];
 50:     --                                           monotonicity: [Inc]}]}
 51:  let {[1i32]i32 acc0_231} =
 52:    segred_thread
 53:    (#groups=num_groups_229; groupsize=segred_group_size_227)
 54:    ({{0i32},
 55:      [],
 56:      commutative fn {i32} (i32 x_201, i32 x_202) =>
 57:        let {i32 res_203} = add32(x_201, x_202)
 58:        in {res_203}})
 59:    (dummy_232 < 1i32, gtid_233 < 10i32) (~phys_tid_234) : {i32} {
 60:      let {i32 x_204} = xs_188[gtid_233]
 61:      return {returns x_204}
 62:    })
 63: (10, let {i32 acc0_200} = acc0_231[0i32])
 64: (11, let {i32 segscan_group_size_238} =
 65:        get_size(segscan_group_size_237, group_size))
 66: (12, let {i32 num_groups_240} =
 67:        calc_num_groups(10i64, segscan_num_groups_239, segscan_group_size_238))
 68: (13, let {mem mem_271} = alloc(40i64))
 69: (14, -- resarr0_206 : [10i32]i32@@mem_271->
 70:      -- {base: [10i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
 71:      --                                            rotates: [0i32]; shape: [10i32];
 72:      --                                            permutation: [0];
 73:      --                                            monotonicity: [Inc]}]}
 74:  let {[10i32]i32 resarr0_206} =
 75:    segscan_thread
 76:    (#groups=num_groups_240; groupsize=segscan_group_size_238)
 77:    ({{0i32},
 78:      [],
 79:      fn {i32} (i32 x_207, i32 x_208) =>
 80:        let {i32 res_209} = add32(x_207, x_208)
 81:        in {res_209}})
 82:    (gtid_242 < 10i32) (~phys_tid_243) : {i32} {
 83:      let {i32 x_210} = ys_190[gtid_242]
 84:      return {returns x_210}
 85:    })
 86: (15, let {i32 segred_group_size_247} =
 87:        get_size(segred_group_size_246, group_size))
 88: (16, let {i32 num_groups_249} =
 89:        calc_num_groups(10i64, segred_num_groups_248, segred_group_size_247))
 90: (17, let {mem mem_275} = alloc(4i64))
 91: (18, -- acc0_251 : [1i32]i32@@mem_275->
 92:      -- {base: [1i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
 93:      --                                           rotates: [0i32]; shape: [1i32];
 94:      --                                           permutation: [0];
 95:      --                                           monotonicity: [Inc]}]}
 96:  let {[1i32]i32 acc0_251} =
 97:    segred_thread
 98:    (#groups=num_groups_249; groupsize=segred_group_size_247)
 99:    ({{0i32},
100:      [],
101:      commutative fn {i32} (i32 x_217, i32 x_218) =>
102:        let {i32 res_219} = add32(x_217, x_218)
103:        in {res_219}})
104:    (dummy_252 < 1i32, gtid_253 < 10i32) (~phys_tid_254) : {i32} {
105:      let {i32 x_220} = resarr0_206[gtid_253]
106:      return {returns x_220}
107:    })
108: (19, let {i32 acc0_216} = acc0_251[0i32])
109: (20, let {i32 res_223} = add32(acc0_200, acc0_216))
110: 
111: last_uses?: [(2, res_187), (2, mem_260), (5, res_189), (5, mem_263), (9, xs_188), (9,
112:                                                                        segred_group_size_227),
113:  (9, num_groups_229), (9, gtid_233), (9, mem_267), (10, acc0_231), (14, ys_190),
114:  (14, segscan_group_size_238), (14, num_groups_240), (14, gtid_242), (14,
115:                                                                       mem_271),
116:  (18, resarr0_206), (18, segred_group_size_247), (18, num_groups_249), (18,
117:                                                                         gtid_253),
118:  (18, mem_275), (19, acc0_251), (20, acc0_200), (20, acc0_216)]
119: 1 adding new frees: []
120: 2 adding new frees: [(VName (Name "mem") 260,Constant (IntValue (Int64Value 40)))]
121: 3 lookup: 40i64
122: 3 found a result: VName (Name "mem") 260
123: 4 adding new frees: []
124: 5 adding new frees: []
125: 6 adding new frees: []
126: 7 adding new frees: []
127: 8 lookup: 4i64
128: 9 adding new frees: [(VName (Name "mem") 260,Constant (IntValue (Int64Value 40)))]
129: 10 adding new frees: [(VName (Name "mem") 267,Constant (IntValue (Int64Value 4)))]
130: 11 adding new frees: []
131: 12 adding new frees: []
132: 13 lookup: 40i64
133: 13 found a result: VName (Name "mem") 260
134: 14 adding new frees: []
135: 15 adding new frees: []
136: 16 adding new frees: []
137: 17 lookup: 4i64
138: 17 found a result: VName (Name "mem") 267
139: 18 adding new frees: []
140: 19 adding new frees: []
141: 20 adding new frees: []
142: allocs:
143: [(VName (Name "mem") 267,Constant (IntValue (Int64Value 4))),(VName (Name "mem") 260,Constant (IntValue (Int64Value 40)))]
144: frees:
145: [(VName (Name "mem") 267,Constant (IntValue (Int64Value 4))),(VName (Name "mem") 260,Constant (IntValue (Int64Value 40))),(VName (Name "mem") 260,Constant (IntValue (Int64Value 40)))]
146: last_uses:
147: []
148: 
149: entry {i32} main (i32 n_186) = {
150:   let {mem mem_260} =
151:     alloc(40i64)
152:   -- res_187 : [10i32]i32@@mem_260->
153:   -- {base: [10i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
154:   --                                            rotates: [0i32]; shape: [10i32];
155:   --                                            permutation: [0];
156:   --                                            monotonicity: [Inc]}]}
157:   let {[10i32]i32 res_187} = iota32(10i32, 0i32, 1i32)
158:   -- xs_188 : [10i32]i32@@mem_260->
159:   -- {base: [10i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
160:   --                                            rotates: [0i32]; shape: [10i32];
161:   --                                            permutation: [0];
162:   --                                            monotonicity: [Inc]}]}
163:   let {[10i32]i32 xs_188} =
164:     -- Consumes res_187
165:     res_187 with [1i32] <- 0i32
166:   -- mem_263 aliases mem_260
167:   let {mem mem_263} = mem_260
168:   -- res_189 : [10i32]i32@@mem_263->
169:   -- {base: [10i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
170:   --                                            rotates: [0i32]; shape: [10i32];
171:   --                                            permutation: [0];
172:   --                                            monotonicity: [Inc]}]}
173:   let {[10i32]i32 res_189} = iota32(10i32, 0i32, 1i32)
174:   -- ys_190 : [10i32]i32@@mem_263->
175:   -- {base: [10i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
176:   --                                            rotates: [0i32]; shape: [10i32];
177:   --                                            permutation: [0];
178:   --                                            monotonicity: [Inc]}]}
179:   let {[10i32]i32 ys_190} =
180:     -- Consumes res_189
181:     res_189 with [1i32] <- 0i32
182:   let {i32 segred_group_size_227} =
183:     get_size(segred_group_size_226, group_size)
184:   let {i32 num_groups_229} =
185:     calc_num_groups(10i64, segred_num_groups_228, segred_group_size_227)
186:   let {mem mem_267} =
187:     alloc(4i64)
188:   -- acc0_231 : [1i32]i32@@mem_267->
189:   -- {base: [1i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
190:   --                                           rotates: [0i32]; shape: [1i32];
191:   --                                           permutation: [0];
192:   --                                           monotonicity: [Inc]}]}
193:   let {[1i32]i32 acc0_231} =
194:     segred_thread
195:     (#groups=num_groups_229; groupsize=segred_group_size_227)
196:     ({{0i32},
197:       [],
198:       commutative fn {i32} (i32 x_201, i32 x_202) =>
199:         let {i32 res_203} = add32(x_201, x_202)
200:         in {res_203}})
201:     (dummy_232 < 1i32, gtid_233 < 10i32) (~phys_tid_234) : {i32} {
202:       let {i32 x_204} = xs_188[gtid_233]
203:       return {returns x_204}
204:     }
205:   let {i32 acc0_200} = acc0_231[0i32]
206:   let {i32 segscan_group_size_238} =
207:     get_size(segscan_group_size_237, group_size)
208:   let {i32 num_groups_240} =
209:     calc_num_groups(10i64, segscan_num_groups_239, segscan_group_size_238)
210:   -- mem_271 aliases mem_260
211:   let {mem mem_271} = mem_260
212:   -- resarr0_206 : [10i32]i32@@mem_271->
213:   -- {base: [10i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
214:   --                                            rotates: [0i32]; shape: [10i32];
215:   --                                            permutation: [0];
216:   --                                            monotonicity: [Inc]}]}
217:   let {[10i32]i32 resarr0_206} =
218:     segscan_thread
219:     (#groups=num_groups_240; groupsize=segscan_group_size_238)
220:     ({{0i32},
221:       [],
222:       fn {i32} (i32 x_207, i32 x_208) =>
223:         let {i32 res_209} = add32(x_207, x_208)
224:         in {res_209}})
225:     (gtid_242 < 10i32) (~phys_tid_243) : {i32} {
226:       let {i32 x_210} = ys_190[gtid_242]
227:       return {returns x_210}
228:     }
229:   let {i32 segred_group_size_247} =
230:     get_size(segred_group_size_246, group_size)
231:   let {i32 num_groups_249} =
232:     calc_num_groups(10i64, segred_num_groups_248, segred_group_size_247)
233:   -- mem_275 aliases mem_267
234:   let {mem mem_275} = mem_267
235:   -- acc0_251 : [1i32]i32@@mem_275->
236:   -- {base: [1i32]; contiguous: True; LMADs: [{offset: 0i32; strides: [1i32];
237:   --                                           rotates: [0i32]; shape: [1i32];
238:   --                                           permutation: [0];
239:   --                                           monotonicity: [Inc]}]}
240:   let {[1i32]i32 acc0_251} =
241:     segred_thread
242:     (#groups=num_groups_249; groupsize=segred_group_size_247)
243:     ({{0i32},
244:       [],
245:       commutative fn {i32} (i32 x_217, i32 x_218) =>
246:         let {i32 res_219} = add32(x_217, x_218)
247:         in {res_219}})
248:     (dummy_252 < 1i32, gtid_253 < 10i32) (~phys_tid_254) : {i32} {
249:       let {i32 x_220} = resarr0_206[gtid_253]
250:       return {returns x_220}
251:     }
252:   let {i32 acc0_216} = acc0_251[0i32]
253:   let {i32 res_223} = add32(acc0_200, acc0_216)
254:   in {res_223}
255: }
256:

As you can see on line 167, it succesfully changes the allocation to reuse an existing allocation! Unfortunately, my last-use analysis is far too simple, even with aliasing. From the debugging output, we can see that mem_260 is reported as being last-used on line 157, when in reality it is used by all references to xs_187. Furthermore, even if we fixed that so the last-use of mem_260 was equal to the last-use of xs_187, that happens immediately after, on line 165, but that's still not the actual last-use of mem_260: res_187 is consumed in the update which creates res_188, which still resides in mem_260.

I knew all of this would be problems with my last-use analysis, but it still feels nice to see that the pass works.

There are many more issues with the pass: It needs to handle nested blocks, and in reality it probably should only concern itself with code inside kernels. But now that I have a working foundation, I feel like the time is right to go back and implement Cosmins more advanced last-use analysis. I clearly need it.