Tensor Tiling Library
 
pipelines/TTL_simplex_scheme.h
/*
 * TTL_simplex_scheme.h
 *
 * Copyright (c) 2025 Mobileye
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// clang-format off
/**
 * @file
 *
 * TTL_simplex_buffering pipelines a pair of import and export transactions using
 * three internal buffers, in rotation: each buffer interchangeably serves as input
 * buffer and output buffer, such that in each iteration one buffer is used first
 * to export and then to import, while the other two are used by compute for
 * reading and writing.
 *
 * With simplex buffering we only wait for previous iterations, so DMA
 * transactions run mostly in parallel with computation, but serially with each
 * other. Using the same buffer for both import and export allows the export from
 * and the subsequent import into that buffer to be overlapped.
 *
 * The following table shows the pipelined actions performed in simplex buffering.
 * It specifies which tile is processed in each iteration:
 *
 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
 * | **Wait Export**   |      |     |     | 0   | i-2                  | NumOfTiles-3   | NumOfTiles-2 | NumOfTiles-1   |
 * | **Export**        |      |     | 0   | 1   | i-1                  | NumOfTiles-2   | NumOfTiles-1 |                |
 * | **Wait Import**   |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 * | **Import**        | 0    | 1   | 2   | 3   | i+1                  |                |              |                |
 * | **Compute**       |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 *
 * Notice the prolog (at iteration -1) and the two epilogs (at iterations
 * NumOfTiles and NumOfTiles+1), which add 3 extra iterations in total.
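 *
 * For example, with NumOfTiles = 4 the scheme runs 7 iterations (-1 through 5):
 * tile 0 is imported at iteration -1, computed at iteration 0, exported at
 * iteration 1, and its export completion is waited on at iteration 2.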
 *
 * @example TTL_simplex_buffering.cl
 */
// clang-format on
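
// For orientation, a minimal usage sketch of the void form of this scheme
// (illustrative only, not part of this file): get_tile(i) stands for the
// application's own tiler and must return TTL_create_empty_tile() once i passes
// the last tile; compute() is the user's kernel work; the names of the
// TTL_io_void_tensor_t fields are assumptions.
//
//     TTL_event_t e_in = TTL_get_event();
//     TTL_event_t e_out = TTL_get_event();
//     TTL_simplex_const_void_tensor_buffering_t scheme = TTL_start_simplex_buffering(
//         l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out, &e_in, &e_out, get_tile(0));
//
//     for (int i = 0; i < number_of_tiles; i++) {
//         // Import tile i+1, export the previously computed tile, compute tile i.
//         TTL_io_void_tensor_t io = TTL_step_buffering(&scheme, get_tile(i + 1), get_tile(i));
//         compute(io.imported_to, io.to_export_from);
//     }
//
//     TTL_finish_buffering(&scheme);  // run the two epilog iterations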

// This file presumes that the following headers have been included beforehand;
// that is not done here for include-path reasons.
// #include "TTL_core.h"
// #include "TTL_import_export.h"
// #include TTL_IMPORT_EXPORT_INCLUDE_H

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local void *int_base[3];            /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_void_tensor_t ext_tensor_in;  /*!< The external tensor being input */
        TTL_ext_void_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;
    TTL_event_t *event_out;
    // Cache previously fetched tiles to avoid repeated get_tile() calls.
    TTL_tile_t next_exported_tile;
    TTL_int_void_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
} TTL_simplex_const_void_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_void_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_void_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

/**
 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
 *
 * @param int_base1 The address of the first buffer to be used in local memory
 * @param int_base2 The address of the second buffer to be used in local memory
 * @param int_base3 The address of the third buffer to be used in local memory
 * @param ext_tensor_in The external tensor to import the input data from
 * @param ext_tensor_out The external tensor to export the output data to
 * @param event_in A pointer to the event to use for the inward (external to
 * internal) transfer completion
 * @param event_out A pointer to the event to use for the outward (internal to
 * external) transfer completion
 * @param first_tile The first tile to fetch for the scheme
 *
 * Simplex buffering is described in detail in the comment at the top of this file.
 *
 * @return The TTL_simplex_buffering_t created from the input parameters
 *
 * Example:
 * @code
 * TTL_event_t tb_e_in = TTL_get_event();
 * TTL_event_t tb_e_out = TTL_get_event();
 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *     &tb_e_in, &tb_e_out, first_tile);
 * @endcode
 * \n
 *
 * Subsequent iterations are driven by the standardized TTL_step_buffering
 * call.
 *
 * @startuml
 *
 * start
 *
 *
 * stop
 *
 * @enduml
 *
 */
static inline TTL_simplex_const_void_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local void *int_base1, __local void *int_base2, __local void *int_base3, TTL_ext_void_tensor_t ext_tensor_in,
    TTL_ext_void_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_void_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;
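
    // Prolog (iteration -1 in the file-level table): start importing first_tile.
    // Nothing is exported yet (empty tile), and the returned io tensors are
    // deliberately unused.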
    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}

static inline TTL_io_void_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_void_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export) {
    // For performance, compute everything possible before waiting for the previous operations to finish. The
    // current index contains the tile that is to be exported, so prepare the structures before beginning the
    // export and import.
    const TTL_layout_t next_import_layout =
        TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
    const TTL_int_void_sub_tensor_t next_import_int_sub_tensor =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_next_import.shape,
                                  next_import_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_next_import.offset);
    const TTL_const_ext_void_tensor_t next_import_ext_tensor =
        TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
                                    tile_next_import.shape,
                                    simplex_buffer->common.ext_tensor_in.layout,
                                    tile_next_import.offset,
                                    simplex_buffer->common.ext_tensor_in.elem_size);

    const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
                                                             simplex_buffer->next_exported_tile.shape.height);
    const TTL_int_void_tensor_t int_export_tensor =
        TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                              simplex_buffer->next_exported_tile.shape,
                              int_export_layout,
                              simplex_buffer->common.ext_tensor_out.elem_size);
    const TTL_ext_void_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
                                                                  simplex_buffer->next_exported_tile.shape,
                                                                  simplex_buffer->common.ext_tensor_out.layout,
                                                                  simplex_buffer->next_exported_tile.offset,
                                                                  simplex_buffer->common.ext_tensor_out.elem_size);

    // Wait for the previous (import/export)s to complete before starting the next.
    TTL_wait(1, simplex_buffer->event_out);
    TTL_wait(1, simplex_buffer->event_in);
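    // These waits pair with the export/import issued in the previous call, so that
    // DMA ran concurrently with the compute done between the two calls.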

    if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
        TTL_export(*TTL_to_const_tensor(&int_export_tensor),
                   *TTL_to_void_tensor(&export_to),
                   simplex_buffer->event_out);

    if (TTL_tile_empty(tile_next_import) == false)
        TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
                              *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
                              simplex_buffer->event_in);

    // The import/export has been started for the current tile. Move to the next
    // tile.
    simplex_buffer->common.index =
        (simplex_buffer->common.index + 1) %
        (sizeof(simplex_buffer->common.int_base) / sizeof(simplex_buffer->common.int_base[0])); // Write to.
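    // With three buffers the index cycles 0 -> 1 -> 2 -> 0: the buffer just handed
    // to the DMA (export then import) leaves compute use, and the next buffer in
    // rotation becomes the compute output buffer below.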

    // Retrieve the buffer imported previously, which is read from now.
    const TTL_int_void_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
    simplex_buffer->int_prev_imported = next_import_int_sub_tensor;

    // The out buffer can be written according to the size of the current tile,
    // rather than the size of the tile just exported.
    const TTL_layout_t curr_int_layout =
        TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
    const TTL_int_void_sub_tensor_t int_curr_buff_out =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_current_export.shape,
                                  curr_int_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_current_export.offset);

    // Save the tile to be exported next, preventing repeated get_tile() calls.
    simplex_buffer->next_exported_tile = tile_current_export;

    return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
}

static inline void __attribute__((overloadable)) TTL_finish_buffering(
    TTL_simplex_const_void_tensor_buffering_t *const simplex_buffering) {
    // Two epilog iterations with empty tiles flush the final export and wait for
    // its completion.
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
}
/*
 * TTL_simplex_scheme.h
 *
 * Copyright (c) 2025 Mobileye
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// clang-format off
/**
 * @file
 *
 * TTL_simplex_buffering pipelines a pair of import and export transactions using
 * three internal buffers, in rotation: each buffer interchangeably serves as input
 * buffer and output buffer, such that in each iteration one buffer is used first
 * to export and then to import, while the other two are used by compute for
 * reading and writing.
 *
 * With simplex buffering we only wait for previous iterations, so DMA
 * transactions run mostly in parallel with computation, but serially with each
 * other. Using the same buffer for both import and export allows the export from
 * and the subsequent import into that buffer to be overlapped.
 *
 * The following table shows the pipelined actions performed in simplex buffering.
 * It specifies which tile is processed in each iteration:
 *
 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
 * | **Wait Export**   |      |     |     | 0   | i-2                  | NumOfTiles-3   | NumOfTiles-2 | NumOfTiles-1   |
 * | **Export**        |      |     | 0   | 1   | i-1                  | NumOfTiles-2   | NumOfTiles-1 |                |
 * | **Wait Import**   |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 * | **Import**        | 0    | 1   | 2   | 3   | i+1                  |                |              |                |
 * | **Compute**       |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 *
 * Notice the prolog (at iteration -1) and the two epilogs (at iterations
 * NumOfTiles and NumOfTiles+1), which add 3 extra iterations in total.
 *
 * @example TTL_simplex_buffering.cl
 */
// clang-format on

// This file presumes that the following headers have been included beforehand;
// that is not done here for include-path reasons.
// #include "TTL_core.h"
// #include "TTL_import_export.h"
// #include TTL_IMPORT_EXPORT_INCLUDE_H

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local char *int_base[3];            /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_char_tensor_t ext_tensor_in;  /*!< The external tensor being input */
        TTL_ext_char_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;
    TTL_event_t *event_out;
    // Cache previously fetched tiles to avoid repeated get_tile() calls.
    TTL_tile_t next_exported_tile;
    TTL_int_char_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
} TTL_simplex_const_char_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_char_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_char_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

/**
 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
 *
 * @param int_base1 The address of the first buffer to be used in local memory
 * @param int_base2 The address of the second buffer to be used in local memory
 * @param int_base3 The address of the third buffer to be used in local memory
 * @param ext_tensor_in The external tensor to import the input data from
 * @param ext_tensor_out The external tensor to export the output data to
 * @param event_in A pointer to the event to use for the inward (external to
 * internal) transfer completion
 * @param event_out A pointer to the event to use for the outward (internal to
 * external) transfer completion
 * @param first_tile The first tile to fetch for the scheme
 *
 * Simplex buffering is described in detail in the comment at the top of this file.
 *
 * @return The TTL_simplex_buffering_t created from the input parameters
 *
 * Example:
 * @code
 * TTL_event_t tb_e_in = TTL_get_event();
 * TTL_event_t tb_e_out = TTL_get_event();
 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *     &tb_e_in, &tb_e_out, first_tile);
 * @endcode
 * \n
 *
 * Subsequent iterations are driven by the standardized TTL_step_buffering
 * call.
 *
 * @startuml
 *
 * start
 *
 *
 * stop
 *
 * @enduml
 *
 */
static inline TTL_simplex_const_char_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local char *int_base1, __local char *int_base2, __local char *int_base3, TTL_ext_char_tensor_t ext_tensor_in,
    TTL_ext_char_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_char_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}

static inline TTL_io_char_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_char_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export) {
    // For performance, compute everything possible before waiting for the previous operations to finish. The
    // current index contains the tile that is to be exported, so prepare the structures before beginning the
    // export and import.
    const TTL_layout_t next_import_layout =
        TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
    const TTL_int_char_sub_tensor_t next_import_int_sub_tensor =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_next_import.shape,
                                  next_import_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_next_import.offset);
    const TTL_const_ext_char_tensor_t next_import_ext_tensor =
        TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
                                    tile_next_import.shape,
                                    simplex_buffer->common.ext_tensor_in.layout,
                                    tile_next_import.offset,
                                    simplex_buffer->common.ext_tensor_in.elem_size);

    const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
                                                             simplex_buffer->next_exported_tile.shape.height);
    const TTL_int_char_tensor_t int_export_tensor =
        TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                              simplex_buffer->next_exported_tile.shape,
                              int_export_layout,
                              simplex_buffer->common.ext_tensor_out.elem_size);
    const TTL_ext_char_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
                                                                  simplex_buffer->next_exported_tile.shape,
                                                                  simplex_buffer->common.ext_tensor_out.layout,
                                                                  simplex_buffer->next_exported_tile.offset,
                                                                  simplex_buffer->common.ext_tensor_out.elem_size);

    // Wait for the previous (import/export)s to complete before starting the next.
    TTL_wait(1, simplex_buffer->event_out);
    TTL_wait(1, simplex_buffer->event_in);

    if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
        TTL_export(*TTL_to_const_tensor(&int_export_tensor),
                   *TTL_to_void_tensor(&export_to),
                   simplex_buffer->event_out);

    if (TTL_tile_empty(tile_next_import) == false)
        TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
                              *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
                              simplex_buffer->event_in);

    // The import/export has been started for the current tile. Move to the next
    // tile.
    simplex_buffer->common.index =
        (simplex_buffer->common.index + 1) %
        (sizeof(simplex_buffer->common.int_base) / sizeof(simplex_buffer->common.int_base[0])); // Write to.

    // Retrieve the buffer imported previously, which is read from now.
    const TTL_int_char_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
    simplex_buffer->int_prev_imported = next_import_int_sub_tensor;

    // The out buffer can be written according to the size of the current tile,
    // rather than the size of the tile just exported.
    const TTL_layout_t curr_int_layout =
        TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
    const TTL_int_char_sub_tensor_t int_curr_buff_out =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_current_export.shape,
                                  curr_int_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_current_export.offset);

    // Save the tile to be exported next, preventing repeated get_tile() calls.
    simplex_buffer->next_exported_tile = tile_current_export;

    return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
}

static inline void __attribute__((overloadable)) TTL_finish_buffering(
    TTL_simplex_const_char_tensor_buffering_t *const simplex_buffering) {
    // Two epilog iterations with empty tiles flush the final export and wait for
    // its completion.
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
}
/*
 * TTL_simplex_scheme.h
 *
 * Copyright (c) 2025 Mobileye
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// clang-format off
/**
 * @file
 *
 * TTL_simplex_buffering pipelines a pair of import and export transactions using
 * three internal buffers, in rotation: each buffer interchangeably serves as input
 * buffer and output buffer, such that in each iteration one buffer is used first
 * to export and then to import, while the other two are used by compute for
 * reading and writing.
 *
 * With simplex buffering we only wait for previous iterations, so DMA
 * transactions run mostly in parallel with computation, but serially with each
 * other. Using the same buffer for both import and export allows the export from
 * and the subsequent import into that buffer to be overlapped.
 *
 * The following table shows the pipelined actions performed in simplex buffering.
 * It specifies which tile is processed in each iteration:
 *
 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
 * | **Wait Export**   |      |     |     | 0   | i-2                  | NumOfTiles-3   | NumOfTiles-2 | NumOfTiles-1   |
 * | **Export**        |      |     | 0   | 1   | i-1                  | NumOfTiles-2   | NumOfTiles-1 |                |
 * | **Wait Import**   |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 * | **Import**        | 0    | 1   | 2   | 3   | i+1                  |                |              |                |
 * | **Compute**       |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 *
 * Notice the prolog (at iteration -1) and the two epilogs (at iterations
 * NumOfTiles and NumOfTiles+1), which add 3 extra iterations in total.
 *
 * @example TTL_simplex_buffering.cl
 */
// clang-format on

// This file presumes that the following headers have been included beforehand;
// that is not done here for include-path reasons.
// #include "TTL_core.h"
// #include "TTL_import_export.h"
// #include TTL_IMPORT_EXPORT_INCLUDE_H

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local uchar *int_base[3];            /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_uchar_tensor_t ext_tensor_in;  /*!< The external tensor being input */
        TTL_ext_uchar_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;
    TTL_event_t *event_out;
    // Cache previously fetched tiles to avoid repeated get_tile() calls.
    TTL_tile_t next_exported_tile;
    TTL_int_uchar_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
} TTL_simplex_const_uchar_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_uchar_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_uchar_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

/**
 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
 *
 * @param int_base1 The address of the first buffer to be used in local memory
 * @param int_base2 The address of the second buffer to be used in local memory
 * @param int_base3 The address of the third buffer to be used in local memory
 * @param ext_tensor_in The external tensor to import the input data from
 * @param ext_tensor_out The external tensor to export the output data to
 * @param event_in A pointer to the event to use for the inward (external to
 * internal) transfer completion
 * @param event_out A pointer to the event to use for the outward (internal to
 * external) transfer completion
 * @param first_tile The first tile to fetch for the scheme
 *
 * Simplex buffering is described in detail in the comment at the top of this file.
 *
 * @return The TTL_simplex_buffering_t created from the input parameters
 *
 * Example:
 * @code
 * TTL_event_t tb_e_in = TTL_get_event();
 * TTL_event_t tb_e_out = TTL_get_event();
 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *     &tb_e_in, &tb_e_out, first_tile);
 * @endcode
 * \n
 *
 * Subsequent iterations are driven by the standardized TTL_step_buffering
 * call.
 *
 * @startuml
 *
 * start
 *
 *
 * stop
 *
 * @enduml
 *
 */
static inline TTL_simplex_const_uchar_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local uchar *int_base1, __local uchar *int_base2, __local uchar *int_base3, TTL_ext_uchar_tensor_t ext_tensor_in,
    TTL_ext_uchar_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_uchar_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}

static inline TTL_io_uchar_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_uchar_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export) {
    // For performance, compute everything possible before waiting for the previous operations to finish. The
    // current index contains the tile that is to be exported, so prepare the structures before beginning the
    // export and import.
    const TTL_layout_t next_import_layout =
        TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
    const TTL_int_uchar_sub_tensor_t next_import_int_sub_tensor =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_next_import.shape,
                                  next_import_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_next_import.offset);
    const TTL_const_ext_uchar_tensor_t next_import_ext_tensor =
        TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
                                    tile_next_import.shape,
                                    simplex_buffer->common.ext_tensor_in.layout,
                                    tile_next_import.offset,
                                    simplex_buffer->common.ext_tensor_in.elem_size);

    const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
                                                             simplex_buffer->next_exported_tile.shape.height);
    const TTL_int_uchar_tensor_t int_export_tensor =
        TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                              simplex_buffer->next_exported_tile.shape,
                              int_export_layout,
                              simplex_buffer->common.ext_tensor_out.elem_size);
    const TTL_ext_uchar_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
                                                                   simplex_buffer->next_exported_tile.shape,
                                                                   simplex_buffer->common.ext_tensor_out.layout,
                                                                   simplex_buffer->next_exported_tile.offset,
                                                                   simplex_buffer->common.ext_tensor_out.elem_size);

    // Wait for the previous (import/export)s to complete before starting the next.
    TTL_wait(1, simplex_buffer->event_out);
    TTL_wait(1, simplex_buffer->event_in);

    if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
        TTL_export(*TTL_to_const_tensor(&int_export_tensor),
                   *TTL_to_void_tensor(&export_to),
                   simplex_buffer->event_out);

    if (TTL_tile_empty(tile_next_import) == false)
        TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
                              *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
                              simplex_buffer->event_in);

    // The import/export has been started for the current tile. Move to the next
    // tile.
    simplex_buffer->common.index =
        (simplex_buffer->common.index + 1) %
        (sizeof(simplex_buffer->common.int_base) / sizeof(simplex_buffer->common.int_base[0])); // Write to.

    // Retrieve the buffer imported previously, which is read from now.
    const TTL_int_uchar_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
    simplex_buffer->int_prev_imported = next_import_int_sub_tensor;

    // The out buffer can be written according to the size of the current tile,
    // rather than the size of the tile just exported.
    const TTL_layout_t curr_int_layout =
        TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
    const TTL_int_uchar_sub_tensor_t int_curr_buff_out =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_current_export.shape,
                                  curr_int_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_current_export.offset);

    // Save the tile to be exported next, preventing repeated get_tile() calls.
    simplex_buffer->next_exported_tile = tile_current_export;

    return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
}

static inline void __attribute__((overloadable)) TTL_finish_buffering(
    TTL_simplex_const_uchar_tensor_buffering_t *const simplex_buffering) {
    // Two epilog iterations with empty tiles flush the final export and wait for
    // its completion.
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
}
/*
 * TTL_simplex_scheme.h
 *
 * Copyright (c) 2025 Mobileye
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// clang-format off
/**
 * @file
 *
 * TTL_simplex_buffering pipelines a pair of import and export transactions using
 * three internal buffers, in rotation: each buffer interchangeably serves as input
 * buffer and output buffer, such that in each iteration one buffer is used first
 * to export and then to import, while the other two are used by compute for
 * reading and writing.
 *
 * With simplex buffering we only wait for previous iterations, so DMA
 * transactions run mostly in parallel with computation, but serially with each
 * other. Using the same buffer for both import and export allows the export from
 * and the subsequent import into that buffer to be overlapped.
 *
 * The following table shows the pipelined actions performed in simplex buffering.
 * It specifies which tile is processed in each iteration:
 *
 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
 * | **Wait Export**   |      |     |     | 0   | i-2                  | NumOfTiles-3   | NumOfTiles-2 | NumOfTiles-1   |
 * | **Export**        |      |     | 0   | 1   | i-1                  | NumOfTiles-2   | NumOfTiles-1 |                |
 * | **Wait Import**   |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 * | **Import**        | 0    | 1   | 2   | 3   | i+1                  |                |              |                |
 * | **Compute**       |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 *
 * Notice the prolog (at iteration -1) and the two epilogs (at iterations
 * NumOfTiles and NumOfTiles+1), which add 3 extra iterations in total.
 *
 * @example TTL_simplex_buffering.cl
 */
// clang-format on

// This file presumes that the following headers have been included beforehand;
// that is not done here for include-path reasons.
// #include "TTL_core.h"
// #include "TTL_import_export.h"
// #include TTL_IMPORT_EXPORT_INCLUDE_H

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local int *int_base[3];            /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_int_tensor_t ext_tensor_in;  /*!< The external tensor being input */
        TTL_ext_int_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;
    TTL_event_t *event_out;
    // Cache previously fetched tiles to avoid repeated get_tile() calls.
    TTL_tile_t next_exported_tile;
    TTL_int_int_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
} TTL_simplex_const_int_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_int_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_int_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

/**
 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
 *
 * @param int_base1 The address of the first buffer to be used in local memory
 * @param int_base2 The address of the second buffer to be used in local memory
 * @param int_base3 The address of the third buffer to be used in local memory
 * @param ext_tensor_in The external tensor to import the input data from
 * @param ext_tensor_out The external tensor to export the output data to
 * @param event_in A pointer to the event to use for the inward (external to
 * internal) transfer completion
 * @param event_out A pointer to the event to use for the outward (internal to
 * external) transfer completion
 * @param first_tile The first tile to fetch for the scheme
 *
 * Simplex buffering is described in detail in the comment at the top of this file.
 *
 * @return The TTL_simplex_buffering_t created from the input parameters
 *
 * Example:
 * @code
 * TTL_event_t tb_e_in = TTL_get_event();
 * TTL_event_t tb_e_out = TTL_get_event();
 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *     &tb_e_in, &tb_e_out, first_tile);
 * @endcode
 * \n
 *
 * Subsequent iterations are driven by the standardized TTL_step_buffering
 * call.
 *
 * @startuml
 *
 * start
 *
 *
 * stop
 *
 * @enduml
 *
 */
static inline TTL_simplex_const_int_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local int *int_base1, __local int *int_base2, __local int *int_base3, TTL_ext_int_tensor_t ext_tensor_in,
    TTL_ext_int_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_int_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}

static inline TTL_io_int_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_int_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export) {
    // For performance, compute everything possible before waiting for the previous operations to finish. The
    // current index contains the tile that is to be exported, so prepare the structures before beginning the
    // export and import.
    const TTL_layout_t next_import_layout =
        TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
    const TTL_int_int_sub_tensor_t next_import_int_sub_tensor =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_next_import.shape,
                                  next_import_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_next_import.offset);
    const TTL_const_ext_int_tensor_t next_import_ext_tensor =
        TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
                                    tile_next_import.shape,
                                    simplex_buffer->common.ext_tensor_in.layout,
                                    tile_next_import.offset,
                                    simplex_buffer->common.ext_tensor_in.elem_size);

    const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
                                                             simplex_buffer->next_exported_tile.shape.height);
    const TTL_int_int_tensor_t int_export_tensor =
        TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                              simplex_buffer->next_exported_tile.shape,
                              int_export_layout,
                              simplex_buffer->common.ext_tensor_out.elem_size);
    const TTL_ext_int_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
                                                                 simplex_buffer->next_exported_tile.shape,
                                                                 simplex_buffer->common.ext_tensor_out.layout,
                                                                 simplex_buffer->next_exported_tile.offset,
                                                                 simplex_buffer->common.ext_tensor_out.elem_size);

    // Wait for the previous (import/export)s to complete before starting the next.
    TTL_wait(1, simplex_buffer->event_out);
    TTL_wait(1, simplex_buffer->event_in);

    if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
        TTL_export(*TTL_to_const_tensor(&int_export_tensor),
                   *TTL_to_void_tensor(&export_to),
                   simplex_buffer->event_out);

    if (TTL_tile_empty(tile_next_import) == false)
        TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
                              *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
                              simplex_buffer->event_in);

    // The import/export has been started for the current tile. Move to the next
    // tile.
    simplex_buffer->common.index =
        (simplex_buffer->common.index + 1) %
        (sizeof(simplex_buffer->common.int_base) / sizeof(simplex_buffer->common.int_base[0])); // Write to.

    // Retrieve the buffer imported previously, which is read from now.
    const TTL_int_int_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
    simplex_buffer->int_prev_imported = next_import_int_sub_tensor;

    // The out buffer can be written according to the size of the current tile,
    // rather than the size of the tile just exported.
    const TTL_layout_t curr_int_layout =
        TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
    const TTL_int_int_sub_tensor_t int_curr_buff_out =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_current_export.shape,
                                  curr_int_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_current_export.offset);

    // Save the tile to be exported next, preventing repeated get_tile() calls.
    simplex_buffer->next_exported_tile = tile_current_export;

    return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
}

static inline void __attribute__((overloadable)) TTL_finish_buffering(
    TTL_simplex_const_int_tensor_buffering_t *const simplex_buffering) {
    // Two epilog iterations with empty tiles flush the final export and wait for
    // its completion.
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
}
/*
 * TTL_simplex_scheme.h
 *
 * Copyright (c) 2025 Mobileye
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// clang-format off
/**
 * @file
 *
 * TTL_simplex_buffering pipelines a pair of import and export transactions using
 * three internal buffers, in rotation: each buffer interchangeably serves as input
 * buffer and output buffer, such that in each iteration one buffer is used first
 * to export and then to import, while the other two are used by compute for
 * reading and writing.
 *
 * With simplex buffering we only wait for previous iterations, so DMA
 * transactions run mostly in parallel with computation, but serially with each
 * other. Using the same buffer for both import and export allows the export from
 * and the subsequent import into that buffer to be overlapped.
 *
 * The following table shows the pipelined actions performed in simplex buffering.
 * It specifies which tile is processed in each iteration:
 *
 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
 * | **Wait Export**   |      |     |     | 0   | i-2                  | NumOfTiles-3   | NumOfTiles-2 | NumOfTiles-1   |
 * | **Export**        |      |     | 0   | 1   | i-1                  | NumOfTiles-2   | NumOfTiles-1 |                |
 * | **Wait Import**   |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 * | **Import**        | 0    | 1   | 2   | 3   | i+1                  |                |              |                |
 * | **Compute**       |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 *
 * Notice the prolog (at iteration -1) and the two epilogs (at iterations
 * NumOfTiles and NumOfTiles+1), which add 3 extra iterations in total.
 *
 * @example TTL_simplex_buffering.cl
 */
// clang-format on

// This file presumes that the following headers have been included beforehand;
// that is not done here for include-path reasons.
// #include "TTL_core.h"
// #include "TTL_import_export.h"
// #include TTL_IMPORT_EXPORT_INCLUDE_H

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local uint *int_base[3];            /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_uint_tensor_t ext_tensor_in;  /*!< The external tensor being input */
        TTL_ext_uint_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;
    TTL_event_t *event_out;
    // Cache previously fetched tiles to avoid repeated get_tile() calls.
    TTL_tile_t next_exported_tile;
    TTL_int_uint_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
} TTL_simplex_const_uint_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_uint_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_uint_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

/**
 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
 *
 * @param int_base1 The address of the first buffer to be used in local memory
 * @param int_base2 The address of the second buffer to be used in local memory
 * @param int_base3 The address of the third buffer to be used in local memory
 * @param ext_tensor_in The external tensor to import the input data from
 * @param ext_tensor_out The external tensor to export the output data to
 * @param event_in A pointer to the event to use for the inward (external to
 * internal) transfer completion
 * @param event_out A pointer to the event to use for the outward (internal to
 * external) transfer completion
 * @param first_tile The first tile to fetch for the scheme
 *
 * Simplex buffering is described in detail in the comment at the top of this file.
 *
 * @return The TTL_simplex_buffering_t created from the input parameters
 *
 * Example:
 * @code
 * TTL_event_t tb_e_in = TTL_get_event();
 * TTL_event_t tb_e_out = TTL_get_event();
 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *     &tb_e_in, &tb_e_out, first_tile);
 * @endcode
 * \n
 *
 * Subsequent iterations are driven by the standardized TTL_step_buffering
 * call.
 *
 * @startuml
 *
 * start
 *
 *
 * stop
 *
 * @enduml
 *
 */
static inline TTL_simplex_const_uint_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local uint *int_base1, __local uint *int_base2, __local uint *int_base3, TTL_ext_uint_tensor_t ext_tensor_in,
    TTL_ext_uint_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_uint_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}

static inline TTL_io_uint_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_uint_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export) {
    // For performance, compute everything possible before waiting for the previous operations to finish. The
    // current index contains the tile that is to be exported, so prepare the structures before beginning the
    // export and import.
    const TTL_layout_t next_import_layout =
        TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
    const TTL_int_uint_sub_tensor_t next_import_int_sub_tensor =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_next_import.shape,
                                  next_import_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_next_import.offset);
    const TTL_const_ext_uint_tensor_t next_import_ext_tensor =
        TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
                                    tile_next_import.shape,
                                    simplex_buffer->common.ext_tensor_in.layout,
                                    tile_next_import.offset,
                                    simplex_buffer->common.ext_tensor_in.elem_size);

    const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
                                                             simplex_buffer->next_exported_tile.shape.height);
    const TTL_int_uint_tensor_t int_export_tensor =
        TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                              simplex_buffer->next_exported_tile.shape,
                              int_export_layout,
                              simplex_buffer->common.ext_tensor_out.elem_size);
    const TTL_ext_uint_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
                                                                  simplex_buffer->next_exported_tile.shape,
                                                                  simplex_buffer->common.ext_tensor_out.layout,
                                                                  simplex_buffer->next_exported_tile.offset,
                                                                  simplex_buffer->common.ext_tensor_out.elem_size);

    // Wait for the previous (import/export)s to complete before starting the next.
    TTL_wait(1, simplex_buffer->event_out);
    TTL_wait(1, simplex_buffer->event_in);

    if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
        TTL_export(*TTL_to_const_tensor(&int_export_tensor),
                   *TTL_to_void_tensor(&export_to),
                   simplex_buffer->event_out);

    if (TTL_tile_empty(tile_next_import) == false)
        TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
                              *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
                              simplex_buffer->event_in);

    // The import/export has been started for the current tile. Move to the next
    // tile.
    simplex_buffer->common.index =
        (simplex_buffer->common.index + 1) %
        (sizeof(simplex_buffer->common.int_base) / sizeof(simplex_buffer->common.int_base[0])); // Write to.

    // Retrieve the buffer imported previously, which is read from now.
    const TTL_int_uint_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
    simplex_buffer->int_prev_imported = next_import_int_sub_tensor;

    // The out buffer can be written according to the size of the current tile,
    // rather than the size of the tile just exported.
    const TTL_layout_t curr_int_layout =
        TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
    const TTL_int_uint_sub_tensor_t int_curr_buff_out =
        TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
                                  tile_current_export.shape,
                                  curr_int_layout,
                                  *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
                                  tile_current_export.offset);

    // Save the tile to be exported next, preventing repeated get_tile() calls.
    simplex_buffer->next_exported_tile = tile_current_export;

    return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
}

static inline void __attribute__((overloadable)) TTL_finish_buffering(
    TTL_simplex_const_uint_tensor_buffering_t *const simplex_buffering) {
    // Two epilog iterations with empty tiles flush the final export and wait for
    // its completion.
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
    TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
}
/*
 * TTL_simplex_scheme.h
 *
 * Copyright (c) 2025 Mobileye
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// clang-format off
/**
 * @file
 *
 * TTL_simplex_buffering pipelines a pair of import and export transactions using
 * three internal buffers, in rotation: each buffer interchangeably serves as input
 * buffer and output buffer, such that in each iteration one buffer is used first
 * to export and then to import, while the other two are used by compute for
 * reading and writing.
 *
 * With simplex buffering we only wait for previous iterations, so DMA
 * transactions run mostly in parallel with computation, but serially with each
 * other. Using the same buffer for both import and export allows the export from
 * and the subsequent import into that buffer to be overlapped.
 *
 * The following table shows the pipelined actions performed in simplex buffering.
 * It specifies which tile is processed in each iteration:
 *
 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
 * | **Wait Export**   |      |     |     | 0   | i-2                  | NumOfTiles-3   | NumOfTiles-2 | NumOfTiles-1   |
 * | **Export**        |      |     | 0   | 1   | i-1                  | NumOfTiles-2   | NumOfTiles-1 |                |
 * | **Wait Import**   |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 * | **Import**        | 0    | 1   | 2   | 3   | i+1                  |                |              |                |
 * | **Compute**       |      | 0   | 1   | 2   | i                    | NumOfTiles-1   |              |                |
 *
 * Notice the prolog (at iteration -1) and the two epilogs (at iterations
 * NumOfTiles and NumOfTiles+1), which add 3 extra iterations in total.
 *
 * @example TTL_simplex_buffering.cl
 */
// clang-format on

// This file presumes that the following headers have been included beforehand;
// that is not done here for include-path reasons.
// #include "TTL_core.h"
// #include "TTL_import_export.h"
// #include TTL_IMPORT_EXPORT_INCLUDE_H

/**
 * The structs used for this buffering type
 */
// TTL_simplex_buffering_t
typedef struct {
    struct {
        int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
                        0->1->0->1... etc */
        __local short *int_base[3];            /*!< The internal base addresses of the pipelined tiles. */
        TTL_ext_short_tensor_t ext_tensor_in;  /*!< The external tensor being input */
        TTL_ext_short_tensor_t ext_tensor_out; /*!< The external tensor being output */
    } common; ///< The information that is common to all pipeline schemes

    TTL_event_t *event_in;
    TTL_event_t *event_out;
    // Cache previously fetched tiles to avoid repeated get_tile() calls.
    TTL_tile_t next_exported_tile;
    TTL_int_short_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
} TTL_simplex_const_short_tensor_buffering_t;

/**
 * Simple declarations for file ordering purposes
 */
static inline TTL_io_short_tensor_t __attribute__((overloadable)) TTL_step_buffering(
    TTL_simplex_const_short_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
    TTL_tile_t tile_current_export);

/**
 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
 *
 * @param int_base1 The address of the first buffer to be used in local memory
 * @param int_base2 The address of the second buffer to be used in local memory
 * @param int_base3 The address of the third buffer to be used in local memory
 * @param ext_tensor_in The external tensor to import the input data from
 * @param ext_tensor_out The external tensor to export the output data to
 * @param event_in A pointer to the event to use for the inward (external to
 * internal) transfer completion
 * @param event_out A pointer to the event to use for the outward (internal to
 * external) transfer completion
 * @param first_tile The first tile to fetch for the scheme
 *
 * Simplex buffering is described in detail in the comment at the top of this file.
 *
 * @return The TTL_simplex_buffering_t created from the input parameters
 *
 * Example:
 * @code
 * TTL_event_t tb_e_in = TTL_get_event();
 * TTL_event_t tb_e_out = TTL_get_event();
 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
 *     l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
 *     &tb_e_in, &tb_e_out, first_tile);
 * @endcode
 * \n
 *
 * Subsequent iterations are driven by the standardized TTL_step_buffering
 * call.
 *
 * @startuml
 *
 * start
 *
 *
 * stop
 *
 * @enduml
 *
 */
static inline TTL_simplex_const_short_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
    __local short *int_base1, __local short *int_base2, __local short *int_base3, TTL_ext_short_tensor_t ext_tensor_in,
    TTL_ext_short_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
    TTL_simplex_const_short_tensor_buffering_t result;

    result.common.int_base[0] = int_base1;
    result.common.int_base[1] = int_base2;
    result.common.int_base[2] = int_base3;
    result.common.ext_tensor_in = ext_tensor_in;
    result.common.ext_tensor_out = ext_tensor_out;
    result.event_in = event_in;
    result.event_out = event_out;
    result.next_exported_tile = TTL_create_empty_tile();

    result.common.index = 0;

    TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());

    return result;
}
1277
1278static inline TTL_io_short_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1279 TTL_simplex_const_short_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1280 TTL_tile_t tile_current_export) {
1281 // For performance, compute everything possible before waiting for the previous operations to finish. The current
1282 // index contains the tile that is to be exported, so prepare the structures before beginning the export and export.
1283 const TTL_layout_t next_import_layout =
1284 TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
1285 const TTL_int_short_sub_tensor_t next_import_int_sub_tensor =
1286 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1287 tile_next_import.shape,
1288 next_import_layout,
1289 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1290 tile_next_import.offset);
1291 const TTL_const_ext_short_tensor_t next_import_ext_tensor =
1293 tile_next_import.shape,
1294 simplex_buffer->common.ext_tensor_in.layout,
1295 tile_next_import.offset,
1296 simplex_buffer->common.ext_tensor_in.elem_size);
1297
1298 const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
1299 simplex_buffer->next_exported_tile.shape.height);
1300 const TTL_int_short_tensor_t int_export_tensor =
1301 TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1302 simplex_buffer->next_exported_tile.shape,
1303 int_export_layout,
1304 simplex_buffer->common.ext_tensor_out.elem_size);
1305 const TTL_ext_short_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
1306 simplex_buffer->next_exported_tile.shape,
1307 simplex_buffer->common.ext_tensor_out.layout,
1308 simplex_buffer->next_exported_tile.offset,
1309 simplex_buffer->common.ext_tensor_out.elem_size);
1310
1311 // Wait for the previous (import/export)s to complete before starting the next.
1312 TTL_wait(1, simplex_buffer->event_out);
1313 TTL_wait(1, simplex_buffer->event_in);
1314
1315 if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
1317 *TTL_to_void_tensor(&export_to),
1318 simplex_buffer->event_out);
1319
1320 if (TTL_tile_empty(tile_next_import) == false)
1321 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1322 *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
1323 simplex_buffer->event_in);
1324
1325 // The import/export has been started for the current tile, Move to the next
1326 // tile.
1327 simplex_buffer->common.index =
1328 (simplex_buffer->common.index + 1) %
1329 (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0])); // Write to.
1330
1331 // Retrieve buffer imported previously to read from now.
1332 const TTL_int_short_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
1333 simplex_buffer->int_prev_imported = next_import_int_sub_tensor;
1334
1335 // Can write to the out buffer according to the size of the current tile, rather
1336 // than the size of the tile just exported.
1337 const TTL_layout_t curr_int_layout =
1338 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1339 const TTL_int_short_sub_tensor_t int_curr_buff_out =
1340 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1341 tile_current_export.shape,
1342 curr_int_layout,
1343 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1344 tile_current_export.offset);
1345
1346 // Cache the current tile (alongside int_prev_imported above) so repeated get_tile() calls are unnecessary.
1347 simplex_buffer->next_exported_tile = tile_current_export;
1348
1349 return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
1350}
1351
1352static inline void __attribute__((overloadable)) TTL_finish_buffering(
1353 TTL_simplex_const_short_tensor_buffering_t *const simplex_buffering) {
1354 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1355 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1356}
1357/*
1358 * TTL_simplex_scheme.h
1359 *
1360 * Copyright (c) 2025 Mobileye
1361 *
1362 * Licensed under the Apache License, Version 2.0 (the License);
1363 * you may not use this file except in compliance with the License.
1364 * You may obtain a copy of the License at
1365 *
1366 * http://www.apache.org/licenses/LICENSE-2.0
1367 *
1368 * Unless required by applicable law or agreed to in writing, software
1369 * distributed under the License is distributed on an AS IS BASIS,
1370 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1371 * See the License for the specific language governing permissions and
1372 * limitations under the License.
1373 */
1374
1375// clang-format off
1376/**
1377 * @file
1378 *
1379 * TTL_simplex_buffering pipelines a pair of import and export transactions using
1380 * three internal buffers, in rotation: each buffer interchangeably serves as input
1381 * buffer and output buffer, such that in each iteration one buffer is used both to
1382 * export then import and two buffers are used by compute for reading and writing.
1383 *
1384 * With simplex buffering we're only waiting for previous iterations, so DMA
1385 * transactions run mostly in parallel to computation, but serially with each
1386 * other. Using the same buffer both for import and export is possible allowing us
1387 * to overlap exporting from and importing to the same buffer.
1388 *
1389 * The following table draws the pipelined actions performed in simplex buffering.
1390 * It specifies which tile is processed in each iteration:
1391 *
1392 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
1393 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
1394 * | **WaitExport** | | | | 0 | i-2 | NumOfTiles-3 | NumOfTiles-2 | NumOfTiles-1 |
1395 * | **Export** | | | 0 | 1 | i-1 | NumOfTiles-2 | NumOfTiles-1 | |
1396 * | **Wait Import** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1397 * | **Import** | 0 | 1 | 2 | 3 | i+1 | | | |
1398 * | **Compute** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1399 *
1400 * Notice the prolog (at iteration number -1) and the 2 epilogs (at iterations
1401 * number NumOfTiles and NumOfTiles+1) which add in total 3 extra iterations.
1402 *
1403 * @example TTL_simplex_buffering.cl
1404 */
1405// clang-format on
1406
1407// This file presumes that the following have been pre included.
1408// this is not done here for path reasons.
1409// #include "TTL_core.h"
1410// #include "TTL_import_export.h"
1411// #include TTL_IMPORT_EXPORT_INCLUDE_H
1412
1413/**
1414 * @brief The structs used for this buffering type
1415 */
1416// TTL_simplex_buffering_t
1417typedef struct {
1418 struct {
1419 int index; /*!< The index of the buffer to write to next when pipelining; this
1420 simplex scheme cycles it 0->1->2->0... over its three buffers. */
1421 __local ushort *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
1422 TTL_ext_ushort_tensor_t ext_tensor_in; /*!< The external tensor being input */
1423 TTL_ext_ushort_tensor_t ext_tensor_out; /*!< The external tensor being output */
1424 } common; ///< The information that is common to all pipeline schemes
1425
1426 TTL_event_t *event_in; /*!< A pointer to the event used for the inward (external to internal) transfers */
1427 TTL_event_t *event_out; /*!< A pointer to the event used for the outward (internal to external) transfers */
1428 // Cache previously fetched tiles.
1429 TTL_tile_t next_exported_tile;
1430 TTL_int_ushort_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
1431} TTL_simplex_const_ushort_tensor_buffering_t;
1432
1433/**
1434 * Forward declarations for file ordering purposes
1435 */
1436static inline TTL_io_ushort_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1437 TTL_simplex_const_ushort_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1438 TTL_tile_t tile_current_export);
1439
1440/**
1441 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
1442 *
1443 * @param int_base1 The address of the first buffer to be used in local memory
1444 * @param int_base2 The address of the second buffer to be used in local memory
1445 * @param int_base3 The address of the third buffer to be used in local memory
1446 * @param ext_tensor_in The external tensor to import the input data from
1447 * @param ext_tensor_out The external tensor to export the output data to
1448 * @param event_in A pointer to the event to use for the inward (external to
1449 * internal) transfer completion
1450 * @param event_out A pointer to the event to use for the outward (internal to
1451 * external) transfer completion
1452 * @param first_tile The first tile to fetch for the scheme
1453 *
1454 * The simplex scheme rotates its three internal buffers between import, compute, and export roles; see the file-level description above.
1455 *
1456 * @return The TTL_simplex_buffering_t created from the input parameters
1457 *
1458 * Example:
1459 * @code
1460 * TTL_event_t tb_e_in = TTL_get_event();
1461 * TTL_event_t tb_e_out = TTL_get_event();
1462 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
1463 * l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
1464 * &tb_e_in, &tb_e_out, first_tile);
1465 * @endcode
1466 * \n
1467 *
1468 * Tile processing can then be standardized using the TTL_step_buffering
1469 * call, as sketched below.
1470 *
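 * A complete tile loop, continuing the example above, could then look as
 * follows. This is a minimal sketch: it assumes the usual TTL tiler helpers
 * (TTL_number_of_tiles() and TTL_get_tile(), with an out-of-range tile id
 * yielding an empty tile) and a compute() function supplied by the calling
 * kernel.
 *
 * @code
 * for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
 *     TTL_tile_t tile_next_import = TTL_get_tile(i + 1, tiler);
 *     TTL_tile_t tile_current_export = TTL_get_tile(i, tiler);
 *     TTL_io_ushort_tensor_t tensors =
 *         TTL_step_buffering(&tb_scheme, tile_next_import, tile_current_export);
 *
 *     compute(tensors.imported_to, tensors.to_export_from);
 * }
 *
 * TTL_finish_buffering(&tb_scheme);
 * @endcode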
1479 *
1480 */
1481static inline TTL_simplex_const_ushort_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
1482 __local ushort *int_base1, __local ushort *int_base2, __local ushort *int_base3,
1483 TTL_ext_ushort_tensor_t ext_tensor_in, TTL_ext_ushort_tensor_t ext_tensor_out, TTL_event_t *event_in,
1484 TTL_event_t *event_out, TTL_tile_t first_tile) {
1485 TTL_simplex_const_ushort_tensor_buffering_t result;
1486
1487 result.common.int_base[0] = int_base1;
1488 result.common.int_base[1] = int_base2;
1489 result.common.int_base[2] = int_base3;
1490 result.common.ext_tensor_in = ext_tensor_in;
1491 result.common.ext_tensor_out = ext_tensor_out;
1492 result.event_in = event_in;
1493 result.event_out = event_out;
1494 result.next_exported_tile = TTL_create_empty_tile();
1495
1496 result.common.index = 0;
1497
1498 result.int_prev_imported = TTL_create_empty_int_sub_tensor(NULL);
1499
1500 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1501
1502 return result;
1503}
1504
1505static inline TTL_io_ushort_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1506 TTL_simplex_const_ushort_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1507 TTL_tile_t tile_current_export) {
1508 // For performance, compute everything possible before waiting for the previous operations to finish. The current
1509 // index contains the tile that is to be exported, so prepare the structures before beginning the export and import.
1510 const TTL_layout_t next_import_layout =
1511 TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
1512 const TTL_int_ushort_sub_tensor_t next_import_int_sub_tensor =
1513 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1514 tile_next_import.shape,
1515 next_import_layout,
1516 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1517 tile_next_import.offset);
1518 const TTL_const_ext_ushort_tensor_t next_import_ext_tensor =
1519 TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
1520 tile_next_import.shape,
1521 simplex_buffer->common.ext_tensor_in.layout,
1522 tile_next_import.offset,
1523 simplex_buffer->common.ext_tensor_in.elem_size);
1524
1525 const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
1526 simplex_buffer->next_exported_tile.shape.height);
1527 const TTL_int_ushort_tensor_t int_export_tensor =
1528 TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1529 simplex_buffer->next_exported_tile.shape,
1530 int_export_layout,
1531 simplex_buffer->common.ext_tensor_out.elem_size);
1532 const TTL_ext_ushort_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
1533 simplex_buffer->next_exported_tile.shape,
1534 simplex_buffer->common.ext_tensor_out.layout,
1535 simplex_buffer->next_exported_tile.offset,
1536 simplex_buffer->common.ext_tensor_out.elem_size);
1537
1538 // Wait for the previous (import/export)s to complete before starting the next.
1539 TTL_wait(1, simplex_buffer->event_out);
1540 TTL_wait(1, simplex_buffer->event_in);
1541
1542 if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
1543 TTL_export(*TTL_to_void_tensor(TTL_to_const_tensor(&int_export_tensor)),
1544 *TTL_to_void_tensor(&export_to),
1545 simplex_buffer->event_out);
1546
1547 if (TTL_tile_empty(tile_next_import) == false)
1548 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1549 *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
1550 simplex_buffer->event_in);
1551
1552 // The import/export has been started for the current tile; move to the
1553 // next tile.
1554 simplex_buffer->common.index =
1555 (simplex_buffer->common.index + 1) %
1556 (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0])); // Next buffer to write to.
1557
1558 // Retrieve buffer imported previously to read from now.
1559 const TTL_int_ushort_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
1560 simplex_buffer->int_prev_imported = next_import_int_sub_tensor;
1561
1562 // Can write to the out buffer according to the size of the current tile, rather
1563 // than the size of the tile just exported.
1564 const TTL_layout_t curr_int_layout =
1565 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1566 const TTL_int_ushort_sub_tensor_t int_curr_buff_out =
1567 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1568 tile_current_export.shape,
1569 curr_int_layout,
1570 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1571 tile_current_export.offset);
1572
1573 // Cache the current tile (alongside int_prev_imported above) so repeated get_tile() calls are unnecessary.
1574 simplex_buffer->next_exported_tile = tile_current_export;
1575
1576 return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
1577}
1578
1579static inline void __attribute__((overloadable)) TTL_finish_buffering(
1580 TTL_simplex_const_ushort_tensor_buffering_t *const simplex_buffering) {
1581 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1582 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1583}
1584/*
1585 * TTL_simplex_scheme.h
1586 *
1587 * Copyright (c) 2025 Mobileye
1588 *
1589 * Licensed under the Apache License, Version 2.0 (the License);
1590 * you may not use this file except in compliance with the License.
1591 * You may obtain a copy of the License at
1592 *
1593 * http://www.apache.org/licenses/LICENSE-2.0
1594 *
1595 * Unless required by applicable law or agreed to in writing, software
1596 * distributed under the License is distributed on an AS IS BASIS,
1597 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1598 * See the License for the specific language governing permissions and
1599 * limitations under the License.
1600 */
1601
1602// clang-format off
1603/**
1604 * @file
1605 *
1606 * TTL_simplex_buffering pipelines a pair of import and export transactions using
1607 * three internal buffers, in rotation: each buffer interchangeably serves as input
1608 * buffer and output buffer, such that in each iteration one buffer is used both to
1609 * export then import and two buffers are used by compute for reading and writing.
1610 *
1611 * With simplex buffering we're only waiting for previous iterations, so DMA
1612 * transactions run mostly in parallel to computation, but serially with each
1613 * other. Using the same buffer both for import and export is possible allowing us
1614 * to overlap exporting from and importing to the same buffer.
1615 *
1616 * The following table draws the pipelined actions performed in simplex buffering.
1617 * It specifies which tile is processed in each iteration:
1618 *
1619 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
1620 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
1621 * | **WaitExport** | | | | 0 | i-2 | NumOfTiles-3 | NumOfTiles-2 | NumOfTiles-1 |
1622 * | **Export** | | | 0 | 1 | i-1 | NumOfTiles-2 | NumOfTiles-1 | |
1623 * | **Wait Import** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1624 * | **Import** | 0 | 1 | 2 | 3 | i+1 | | | |
1625 * | **Compute** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1626 *
1627 * Notice the prolog (at iteration number -1) and the 2 epilogs (at iterations
1628 * number NumOfTiles and NumOfTiles+1) which add in total 3 extra iterations.
1629 *
1630 * @example TTL_simplex_buffering.cl
1631 */
1632// clang-format on
1633
1634// This file presumes that the following have been pre included.
1635// this is not done here for path reasons.
1636// #include "TTL_core.h"
1637// #include "TTL_import_export.h"
1638// #include TTL_IMPORT_EXPORT_INCLUDE_H
1639
1640/**
1641 * @brief The structs used for this buffering type
1642 */
1643// TTL_simplex_buffering_t
1644typedef struct {
1645 struct {
1646 int index; /*!< The index of the buffer to write to next when pipelining; this
1647 simplex scheme cycles it 0->1->2->0... over its three buffers. */
1648 __local long *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
1649 TTL_ext_long_tensor_t ext_tensor_in; /*!< The external tensor being input */
1650 TTL_ext_long_tensor_t ext_tensor_out; /*!< The external tensor being output */
1651 } common; ///< The information that is common to all pipeline schemes
1652
1653 TTL_event_t *event_in; /*!< A pointer to the event used for the inward (external to internal) transfers */
1654 TTL_event_t *event_out; /*!< A pointer to the event used for the outward (internal to external) transfers */
1655 // Cache previously fetched tiles.
1656 TTL_tile_t next_exported_tile;
1657 TTL_int_long_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
1658} TTL_simplex_const_long_tensor_buffering_t;
1659
1660/**
1661 * Forward declarations for file ordering purposes
1662 */
1663static inline TTL_io_long_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1664 TTL_simplex_const_long_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1665 TTL_tile_t tile_current_export);
1666
1667/**
1668 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
1669 *
1670 * @param int_base1 The address of the first buffer to be used in local memory
1671 * @param int_base2 The address of the second buffer to be used in local memory
1672 * @param int_base3 The address of the third buffer to be used in local memory
1673 * @param ext_tensor_in The external tensor to import the input data from
1674 * @param ext_tensor_out The external tensor to export the output data to
1675 * @param event_in A pointer to the event to use for the inward (external to
1676 * internal) transfer completion
1677 * @param event_out A pointer to the event to use for the outward (internal to
1678 * external) transfer completion
1679 * @param first_tile The first tile to fetch for the scheme
1680 *
1681 * The simplex scheme rotates its three internal buffers between import, compute, and export roles; see the file-level description above.
1682 *
1683 * @return The TTL_simplex_buffering_t created from the input parameters
1684 *
1685 * Example:
1686 * @code
1687 * TTL_event_t tb_e_in = TTL_get_event();
1688 * TTL_event_t tb_e_out = TTL_get_event();
1689 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
1690 * l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
1691 * &tb_e_in, &tb_e_out, first_tile);
1692 * @endcode
1693 * \n
1694 *
1695 * Tile processing can then be standardized using the TTL_step_buffering
1696 * call, as sketched below.
1697 *
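 * A complete tile loop, continuing the example above, could then look as
 * follows. This is a minimal sketch: it assumes the usual TTL tiler helpers
 * (TTL_number_of_tiles() and TTL_get_tile(), with an out-of-range tile id
 * yielding an empty tile) and a compute() function supplied by the calling
 * kernel.
 *
 * @code
 * for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
 *     TTL_tile_t tile_next_import = TTL_get_tile(i + 1, tiler);
 *     TTL_tile_t tile_current_export = TTL_get_tile(i, tiler);
 *     TTL_io_long_tensor_t tensors =
 *         TTL_step_buffering(&tb_scheme, tile_next_import, tile_current_export);
 *
 *     compute(tensors.imported_to, tensors.to_export_from);
 * }
 *
 * TTL_finish_buffering(&tb_scheme);
 * @endcode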
1706 *
1707 */
1708static inline TTL_simplex_const_long_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
1709 __local long *int_base1, __local long *int_base2, __local long *int_base3, TTL_ext_long_tensor_t ext_tensor_in,
1710 TTL_ext_long_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
1711 TTL_simplex_const_long_tensor_buffering_t result;
1712
1713 result.common.int_base[0] = int_base1;
1714 result.common.int_base[1] = int_base2;
1715 result.common.int_base[2] = int_base3;
1716 result.common.ext_tensor_in = ext_tensor_in;
1717 result.common.ext_tensor_out = ext_tensor_out;
1718 result.event_in = event_in;
1719 result.event_out = event_out;
1720 result.next_exported_tile = TTL_create_empty_tile();
1721
1722 result.common.index = 0;
1723
1724 result.int_prev_imported = TTL_create_empty_int_sub_tensor(NULL);
1725
1726 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1727
1728 return result;
1729}
1730
1731static inline TTL_io_long_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1732 TTL_simplex_const_long_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1733 TTL_tile_t tile_current_export) {
1734 // For performance, compute everything possible before waiting for the previous operations to finish. The current
1735 // index contains the tile that is to be exported, so prepare the structures before beginning the export and import.
1736 const TTL_layout_t next_import_layout =
1737 TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
1738 const TTL_int_long_sub_tensor_t next_import_int_sub_tensor =
1739 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1740 tile_next_import.shape,
1741 next_import_layout,
1742 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1743 tile_next_import.offset);
1744 const TTL_const_ext_long_tensor_t next_import_ext_tensor =
1745 TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
1746 tile_next_import.shape,
1747 simplex_buffer->common.ext_tensor_in.layout,
1748 tile_next_import.offset,
1749 simplex_buffer->common.ext_tensor_in.elem_size);
1750
1751 const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
1752 simplex_buffer->next_exported_tile.shape.height);
1753 const TTL_int_long_tensor_t int_export_tensor =
1754 TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1755 simplex_buffer->next_exported_tile.shape,
1756 int_export_layout,
1757 simplex_buffer->common.ext_tensor_out.elem_size);
1758 const TTL_ext_long_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
1759 simplex_buffer->next_exported_tile.shape,
1760 simplex_buffer->common.ext_tensor_out.layout,
1761 simplex_buffer->next_exported_tile.offset,
1762 simplex_buffer->common.ext_tensor_out.elem_size);
1763
1764 // Wait for the previous (import/export)s to complete before starting the next.
1765 TTL_wait(1, simplex_buffer->event_out);
1766 TTL_wait(1, simplex_buffer->event_in);
1767
1768 if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
1769 TTL_export(*TTL_to_void_tensor(TTL_to_const_tensor(&int_export_tensor)),
1770 *TTL_to_void_tensor(&export_to),
1771 simplex_buffer->event_out);
1772
1773 if (TTL_tile_empty(tile_next_import) == false)
1774 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1775 *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
1776 simplex_buffer->event_in);
1777
1778 // The import/export has been started for the current tile; move to the
1779 // next tile.
1780 simplex_buffer->common.index =
1781 (simplex_buffer->common.index + 1) %
1782 (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0])); // Next buffer to write to.
1783
1784 // Retrieve buffer imported previously to read from now.
1785 const TTL_int_long_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
1786 simplex_buffer->int_prev_imported = next_import_int_sub_tensor;
1787
1788 // Can write to the out buffer according to the size of the current tile, rather
1789 // than the size of the tile just exported.
1790 const TTL_layout_t curr_int_layout =
1791 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1792 const TTL_int_long_sub_tensor_t int_curr_buff_out =
1793 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1794 tile_current_export.shape,
1795 curr_int_layout,
1796 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1797 tile_current_export.offset);
1798
1799 // Cache the current tile (alongside int_prev_imported above) so repeated get_tile() calls are unnecessary.
1800 simplex_buffer->next_exported_tile = tile_current_export;
1801
1802 return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
1803}
1804
1805static inline void __attribute__((overloadable)) TTL_finish_buffering(
1806 TTL_simplex_const_long_tensor_buffering_t *const simplex_buffering) {
1807 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1808 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
1809}
1810/*
1811 * TTL_simplex_scheme.h
1812 *
1813 * Copyright (c) 2025 Mobileye
1814 *
1815 * Licensed under the Apache License, Version 2.0 (the License);
1816 * you may not use this file except in compliance with the License.
1817 * You may obtain a copy of the License at
1818 *
1819 * http://www.apache.org/licenses/LICENSE-2.0
1820 *
1821 * Unless required by applicable law or agreed to in writing, software
1822 * distributed under the License is distributed on an AS IS BASIS,
1823 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1824 * See the License for the specific language governing permissions and
1825 * limitations under the License.
1826 */
1827
1828// clang-format off
1829/**
1830 * @file
1831 *
1832 * TTL_simplex_buffering pipelines a pair of import and export transactions using
1833 * three internal buffers, in rotation: each buffer interchangeably serves as input
1834 * buffer and output buffer, such that in each iteration one buffer is used both to
1835 * export then import and two buffers are used by compute for reading and writing.
1836 *
1837 * With simplex buffering we're only waiting for previous iterations, so DMA
1838 * transactions run mostly in parallel to computation, but serially with each
1839 * other. Using the same buffer both for import and export is possible allowing us
1840 * to overlap exporting from and importing to the same buffer.
1841 *
1842 * The following table draws the pipelined actions performed in simplex buffering.
1843 * It specifies which tile is processed in each iteration:
1844 *
1845 * | Action\\Iteration | \#-1 | \#0 | \#1 | \#2 | \#i (2:NumOfTiles-2) | \#NumOfTiles-1 | \#NumOfTiles | \#NumOfTiles+1 |
1846 * |-------------------|------|-----|-----|-----|----------------------|----------------|--------------|----------------|
1847 * | **WaitExport** | | | | 0 | i-2 | NumOfTiles-3 | NumOfTiles-2 | NumOfTiles-1 |
1848 * | **Export** | | | 0 | 1 | i-1 | NumOfTiles-2 | NumOfTiles-1 | |
1849 * | **Wait Import** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1850 * | **Import** | 0 | 1 | 2 | 3 | i+1 | | | |
1851 * | **Compute** | | 0 | 1 | 2 | i | NumOfTiles-1 | | |
1852 *
1853 * Notice the prolog (at iteration number -1) and the 2 epilogs (at iterations
1854 * number NumOfTiles and NumOfTiles+1) which add in total 3 extra iterations.
1855 *
1856 * @example TTL_simplex_buffering.cl
1857 */
1858// clang-format on
1859
1860// This file presumes that the following have been pre included.
1861// this is not done here for path reasons.
1862// #include "TTL_core.h"
1863// #include "TTL_import_export.h"
1864// #include TTL_IMPORT_EXPORT_INCLUDE_H
1865
1866/**
1867 * @brief The structs used for this buffering type
1868 */
1869// TTL_simplex_buffering_t
1870typedef struct {
1871 struct {
1872 int index; /*!< The index of the buffer to write to next when pipelining; this
1873 simplex scheme cycles it 0->1->2->0... over its three buffers. */
1874 __local ulong *int_base[3]; /*!< The internal base addresses of the pipelined tiles. */
1875 TTL_ext_ulong_tensor_t ext_tensor_in; /*!< The external tensor being input */
1876 TTL_ext_ulong_tensor_t ext_tensor_out; /*!< The external tensor being output */
1877 } common; ///< The information that is common to all pipeline schemes
1878
1879 TTL_event_t *event_in; /*!< A pointer to the event used for the inward (external to internal) transfers */
1880 TTL_event_t *event_out; /*!< A pointer to the event used for the outward (internal to external) transfers */
1881 // Cache previously fetched tiles.
1882 TTL_tile_t next_exported_tile;
1883 TTL_int_ulong_sub_tensor_t int_prev_imported; // Cache previously imported internal buffer.
1884} TTL_simplex_const_ulong_tensor_buffering_t;
1885
1886/**
1887 * Forward declarations for file ordering purposes
1888 */
1889static inline TTL_io_ulong_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1890 TTL_simplex_const_ulong_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1891 TTL_tile_t tile_current_export);
1892
1893/**
1894 * @brief Create a TTL_simplex_buffering_t and begin the buffering process
1895 *
1896 * @param int_base1 The address of the first buffer to be used in local memory
1897 * @param int_base2 The address of the second buffer to be used in local memory
1898 * @param int_base3 The address of the third buffer to be used in local memory
1899 * @param ext_tensor_in The external tensor to import the input data from
1900 * @param ext_tensor_out The external tensor to export the output data to
1901 * @param event_in A pointer to the event to use for the inward (external to
1902 * internal) transfer completion
1903 * @param event_out A pointer to the event to use for the outward (internal to
1904 * external) transfer completion
1905 * @param first_tile The first tile to fetch for the scheme
1906 *
1907 * The simplex scheme rotates its three internal buffers between import, compute, and export roles; see the file-level description above.
1908 *
1909 * @return The TTL_simplex_buffering_t created from the input parameters
1910 *
1911 * Example:
1912 * @code
1913 * TTL_event_t tb_e_in = TTL_get_event();
1914 * TTL_event_t tb_e_out = TTL_get_event();
1915 * TTL_simplex_buffering_t tb_scheme = TTL_start_simplex_buffering(
1916 * l_buff1, l_buff2, l_buff3, ext_tensor_in, ext_tensor_out,
1917 * &tb_e_in, &tb_e_out, first_tile);
1918 * @endcode
1919 * \n
1920 *
1921 * Tile processing can then be standardized using the TTL_step_buffering
1922 * call, as sketched below.
1923 *
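 * A complete tile loop, continuing the example above, could then look as
 * follows. This is a minimal sketch: it assumes the usual TTL tiler helpers
 * (TTL_number_of_tiles() and TTL_get_tile(), with an out-of-range tile id
 * yielding an empty tile) and a compute() function supplied by the calling
 * kernel.
 *
 * @code
 * for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
 *     TTL_tile_t tile_next_import = TTL_get_tile(i + 1, tiler);
 *     TTL_tile_t tile_current_export = TTL_get_tile(i, tiler);
 *     TTL_io_ulong_tensor_t tensors =
 *         TTL_step_buffering(&tb_scheme, tile_next_import, tile_current_export);
 *
 *     compute(tensors.imported_to, tensors.to_export_from);
 * }
 *
 * TTL_finish_buffering(&tb_scheme);
 * @endcode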
1932 *
1933 */
1934static inline TTL_simplex_const_ulong_tensor_buffering_t __attribute__((overloadable)) TTL_start_simplex_buffering(
1935 __local ulong *int_base1, __local ulong *int_base2, __local ulong *int_base3, TTL_ext_ulong_tensor_t ext_tensor_in,
1936 TTL_ext_ulong_tensor_t ext_tensor_out, TTL_event_t *event_in, TTL_event_t *event_out, TTL_tile_t first_tile) {
1937 TTL_simplex_const_ulong_tensor_buffering_t result;
1938
1939 result.common.int_base[0] = int_base1;
1940 result.common.int_base[1] = int_base2;
1941 result.common.int_base[2] = int_base3;
1942 result.common.ext_tensor_in = ext_tensor_in;
1943 result.common.ext_tensor_out = ext_tensor_out;
1944 result.event_in = event_in;
1945 result.event_out = event_out;
1946 result.next_exported_tile = TTL_create_empty_tile();
1947
1948 result.common.index = 0;
1949
1950 result.int_prev_imported = TTL_create_empty_int_sub_tensor(NULL);
1951
1952 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1953
1954 return result;
1955}
1956
1957static inline TTL_io_ulong_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1958 TTL_simplex_const_ulong_tensor_buffering_t *const simplex_buffer, TTL_tile_t tile_next_import,
1959 TTL_tile_t tile_current_export) {
1960 // For performance, compute everything possible before waiting for the previous operations to finish. The current
1961 // index contains the tile that is to be exported, so prepare the structures before beginning the export and import.
1962 const TTL_layout_t next_import_layout =
1963 TTL_create_layout(tile_next_import.shape.width, tile_next_import.shape.height);
1964 const TTL_int_ulong_sub_tensor_t next_import_int_sub_tensor =
1965 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1966 tile_next_import.shape,
1967 next_import_layout,
1968 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
1969 tile_next_import.offset);
1970 const TTL_const_ext_ulong_tensor_t next_import_ext_tensor =
1971 TTL_create_const_ext_tensor(simplex_buffer->common.ext_tensor_in.base,
1972 tile_next_import.shape,
1973 simplex_buffer->common.ext_tensor_in.layout,
1974 tile_next_import.offset,
1975 simplex_buffer->common.ext_tensor_in.elem_size);
1976
1977 const TTL_layout_t int_export_layout = TTL_create_layout(simplex_buffer->next_exported_tile.shape.width,
1978 simplex_buffer->next_exported_tile.shape.height);
1979 const TTL_int_ulong_tensor_t int_export_tensor =
1980 TTL_create_int_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
1981 simplex_buffer->next_exported_tile.shape,
1982 int_export_layout,
1983 simplex_buffer->common.ext_tensor_out.elem_size);
1984 const TTL_ext_ulong_tensor_t export_to = TTL_create_ext_tensor(simplex_buffer->common.ext_tensor_out.base,
1985 simplex_buffer->next_exported_tile.shape,
1986 simplex_buffer->common.ext_tensor_out.layout,
1987 simplex_buffer->next_exported_tile.offset,
1988 simplex_buffer->common.ext_tensor_out.elem_size);
1989
1990 // Wait for the previous (import/export)s to complete before starting the next.
1991 TTL_wait(1, simplex_buffer->event_out);
1992 TTL_wait(1, simplex_buffer->event_in);
1993
1994 if (TTL_tile_empty(simplex_buffer->next_exported_tile) == false)
1995 TTL_export(*TTL_to_void_tensor(TTL_to_const_tensor(&int_export_tensor)),
1996 *TTL_to_void_tensor(&export_to),
1997 simplex_buffer->event_out);
1998
1999 if (TTL_tile_empty(tile_next_import) == false)
2000 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
2001 *TTL_to_void_tensor(TTL_to_const_tensor(&next_import_ext_tensor)),
2002 simplex_buffer->event_in);
2003
2004 // The import/export has been started for the current tile; move to the
2005 // next tile.
2006 simplex_buffer->common.index =
2007 (simplex_buffer->common.index + 1) %
2008 (sizeof((simplex_buffer->common.int_base)) / sizeof((simplex_buffer->common.int_base)[0])); // Next buffer to write to.
2009
2010 // Retrieve buffer imported previously to read from now.
2011 const TTL_int_ulong_sub_tensor_t int_curr_buff_in = simplex_buffer->int_prev_imported;
2012 simplex_buffer->int_prev_imported = next_import_int_sub_tensor;
2013
2014 // Can write to the out buffer according to the size of the current tile, rather
2015 // than the size of the tile just exported.
2016 const TTL_layout_t curr_int_layout =
2017 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
2018 const TTL_int_ulong_sub_tensor_t int_curr_buff_out =
2019 TTL_create_int_sub_tensor(simplex_buffer->common.int_base[simplex_buffer->common.index],
2020 tile_current_export.shape,
2021 curr_int_layout,
2022 *TTL_to_const_tensor(&simplex_buffer->common.ext_tensor_in),
2023 tile_current_export.offset);
2024
2026 // Cache the current tile (alongside int_prev_imported above) so repeated get_tile() calls are unnecessary.
2026 simplex_buffer->next_exported_tile = tile_current_export;
2027
2028 return TTL_create_io_tensors(int_curr_buff_in, int_curr_buff_out);
2029}
2030
2031static inline void __attribute__((overloadable)) TTL_finish_buffering(
2032 TTL_simplex_const_ulong_tensor_buffering_t *const simplex_buffering) {
2033 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
2034 TTL_step_buffering(simplex_buffering, TTL_create_empty_tile(), TTL_create_empty_tile());
2035}