Tensor Tiling Library
 
TTL_duplex_scheme.h
1/*
2 * TTL_duplex_scheme.h
3 *
4 * Copyright (c) 2023 Mobileye
5 *
6 * Licensed under the Apache License, Version 2.0 (the License);
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19// clang-format off
20/**
21 * @file
22 *
23 * Given a pair of blocking import and export operations that can execute concurrently,
24 * TTL_duplex_buffering issues them together and then waits on both to complete,
25 * ideally executing them in parallel. This scheme uses two
26 * internal buffers, one for the import and one for the export. Note that the
27 * export is pipelined to pair the import of the current tile with the export of
28 * the previous tile.
29 *
30 * The following table shows the pipelined actions performed in duplex buffering.
31 * It specifies which tile is processed in each iteration:
32 *
33 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
34 * |-------------------|-----|-----|----------------------|---------------|
35 * | **Import** | 0 | 1 | i | |
36 * | **Wait Import** | 0 | 1 | i | |
37 * | **Compute** | 0 | 1 | i | |
38 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
39 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
40 *
41 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
42 *
43 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
44 *
45 * #define TTL_TENSOR_TYPE void
46 * #define TTL_TENSOR_TYPE uchar
47 * etc.
48 *
49 * @example TTL_duplex_buffering.cl
50 */
51// clang-format on
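
Before the listing continues, the sketch below shows one way a kernel might drive this scheme for uchar data, following the table above: one TTL_step_buffering call per tile plus an empty-tile pass for the epilog. The tensor/tiler constructor overloads, TTL_get_tile, the TILE_WIDTH/TILE_HEIGHT sizes and the compute() helper are illustrative assumptions rather than definitions made in this file; the shipped TTL_duplex_buffering.cl example remains the authoritative reference.

#define TILE_WIDTH 128
#define TILE_HEIGHT 32

void compute(TTL_io_uchar_tensor_t io); /* application-defined; sketched later in this file */

__kernel void duplex_sketch(__global uchar *restrict ext_base_in, __global uchar *restrict ext_base_out,
                            const int width, const int height) {
    __local uchar l_in[TILE_WIDTH * TILE_HEIGHT];   /* import buffer */
    __local uchar l_out[TILE_WIDTH * TILE_HEIGHT];  /* export buffer */

    /* Describe the external tensors and split the image into tiles (assumed constructor overloads). */
    const TTL_shape_t image_shape = TTL_create_shape(width, height);
    const TTL_layout_t ext_layout = TTL_create_layout(width);
    const TTL_ext_uchar_tensor_t ext_in = TTL_create_ext_tensor(ext_base_in, image_shape, ext_layout);
    const TTL_ext_uchar_tensor_t ext_out = TTL_create_ext_tensor(ext_base_out, image_shape, ext_layout);
    const TTL_tiler_t tiler = TTL_create_tiler(image_shape, TTL_create_shape(TILE_WIDTH, TILE_HEIGHT));

    TTL_event_t events[2] = { TTL_get_event(), TTL_get_event() };
    TTL_duplex_const_uchar_tensor_buffering_t scheme =
        TTL_start_duplex_buffering(ext_in, l_in, ext_out, l_out, &events, TTL_get_tile(0, tiler));

    /* One extra pass (the epilog in the table above) flushes the export of the final tile. */
    for (int tile_id = 0; tile_id <= TTL_number_of_tiles(tiler); tile_id++) {
        const TTL_tile_t tile = (tile_id < TTL_number_of_tiles(tiler)) ? TTL_get_tile(tile_id, tiler)
                                                                       : TTL_create_empty_tile();

        /* Imports this tile, exports the result of the previous tile, and waits on both. */
        const TTL_io_uchar_tensor_t io = TTL_step_buffering(&scheme, tile, tile);

        if (TTL_tile_empty(tile) == false)
            compute(io); /* reads io.imported_to, writes io.to_export_from (assumed member names) */
    }

    TTL_finish_buffering(&scheme);
}
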
52
53// This file presumes that the following have been pre-included;
54// this is not done here for path reasons.
55// #include "TTL_core.h"
56// #include "TTL_import_export.h"
57// #include TTL_IMPORT_EXPORT_INCLUDE_H
58
59/**
60 * @def The structs used for this buffering type
61 */
62
63/**
64 * @brief Data required to perform duplex buffer pipelining.
65 *
66 * @see TTL_start_duplex_buffering for a description of duplex buffer
67 * pipelining.
68 */
69typedef struct {
70 struct {
71 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
72 0->1->0->1... etc */
73 __local void *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
74 TTL_ext_void_tensor_t ext_tensor_in; /*!< The external tensor being input */
75 TTL_ext_void_tensor_t ext_tensor_out; /*!< The external tensor being output */
76 } common; ///< The information that is common to all pipeline schemes
77
78 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
79 ///< external-to-internal transfers, the second for
80 ///< internal-to-external transfers
81
82 /**
83 * @brief Store of the buffers used for the previous import/export cycles.
84 *
85 */
86 struct {
87 TTL_ext_void_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
88 TTL_const_int_void_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
89 } prev_out_tensors;
90} TTL_duplex_const_void_tensor_buffering_t;
91
92/*
93 * Predeclare TTL_step_buffering.
94 */
95static inline TTL_io_void_tensor_t __attribute__((overloadable)) TTL_step_buffering(
96 TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
97 TTL_tile_t tile_current_export);
98
99/**
100 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
101 *
102 * @param ext_tensor_in A tensor describing the input in global memory
103 * @param int_base_in The address of the local import buffer.
104 * @param ext_tensor_out A tensor describing the output in global memory
105 * @param int_base_out The address of the local export buffer.
106 * @param events A pointer to a list of 2 events.
107 * The first event in the list will be used for imports, the second event in
108 * the list will be used for exports.
109 * @param first_tile The first tile to fetch for the scheme
110 *
111 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
112 *
113 * The first event in the list will be used for imports,
114 * the second event in the list will be used for exports.
115 * \n\n Example:
116 * @code
117 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
118 *
119 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
120 * ext_tensor_in, l_buffers[0],
121 * ext_tensor_out, l_buffers[1],
122 * &events, first_tile);
123 * @endcode
124 * \n
125 *
126 * @return The TTL_duplex_buffering_t created from the input parameters.
127 *
128 * Duplex buffering is described in more detail below.
129 *
130 * The simplest form of duplex buffering takes the following flow.
131 *
132 * @startuml
133 *
134 * start
135 *
136 * :Create a TTL_tiler_t with TTL_create_tiler;
137 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
138 * 1 input buffer, 1 output buffer;
139 * :NumberOfTiles = TTL_number_of_tiles(tiler);
140 *
141 * while (for each tile)
142 *
143 * :Import The Next Tile into the input buffer;
144 *
145 * :Process the Tile from the input buffer to the output buffer;
146 *
147 * :Export the Processed Tile from the output buffer;
148 *
149 * endwhile
150 *
151 * stop
152 *
153 * @enduml
154 *
155 * This can be optimized and standardized using the TTL_step_buffering
156 * call.
157 *
158 * @startuml
159 *
160 * start
161 *
162 * :Create a TTL_tiler_t with TTL_create_tiler;
163 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
164 * :NumberOfTiles = TTL_number_of_tiles(tiler);
165 *
166 * while (for each tile)
167 *
168 * :Call TTL_step_buffering for the current tile
169 *
170 * This will import the current new tile and export the last tile
171 * in parallel;
172 *
173 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
174 * :Process the Tile from the input buffer to the output buffer;
175 * endif
176 *
177 * endwhile
178 *
179 * stop
180 *
181 * @enduml
182 */
183static inline TTL_duplex_const_void_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
184 TTL_ext_void_tensor_t ext_tensor_in, __local void *int_base_in, TTL_ext_void_tensor_t ext_tensor_out,
185 __local void *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
186 TTL_duplex_const_void_tensor_buffering_t result;
187 result.common.int_base[0] = int_base_in;
188 result.common.int_base[1] = int_base_out;
189
190 result.common.ext_tensor_in = ext_tensor_in;
191 result.common.ext_tensor_out = ext_tensor_out;
192 result.events = events;
195
196 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
197
198 return result;
199}
200
201static inline TTL_io_void_tensor_t __attribute__((overloadable)) TTL_step_buffering(
202 TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
203 TTL_tile_t tile_current_export) {
204 const TTL_layout_t next_import_layout =
205 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
206 const TTL_const_ext_void_tensor_t next_import_ext_tensor =
208 tile_current_import.shape,
209 duplex_buffering->common.ext_tensor_in.layout,
210 tile_current_import.offset,
211 duplex_buffering->common.ext_tensor_in.elem_size);
212 const TTL_int_void_sub_tensor_t next_import_int_sub_tensor =
213 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
214 tile_current_import.shape,
215 next_import_layout,
216 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
217 tile_current_import.offset);
218
219 const TTL_const_int_void_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
220 const TTL_ext_void_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
221
222 if (TTL_tile_empty(tile_current_import) == false)
223 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
224 *TTL_to_void_tensor(&next_import_ext_tensor),
225 &(*duplex_buffering->events)[0]);
226
227 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
228 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
229 *TTL_to_void_tensor(&next_export_ext_tensor),
230 &(*duplex_buffering->events)[1]);
231
232 const TTL_layout_t int_export_layout =
233 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
234 const TTL_ext_void_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
235 tile_current_export.shape,
236 duplex_buffering->common.ext_tensor_out.layout,
237 tile_current_export.offset,
238 duplex_buffering->common.ext_tensor_out.elem_size);
239 const TTL_int_void_sub_tensor_t to_export_from =
240 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
241 tile_current_export.shape,
242 int_export_layout,
243 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
244 tile_current_export.offset);
245
246 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
247 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
248
249 TTL_wait(2, *duplex_buffering->events);
250
251 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
252}
253
254static inline void __attribute__((overloadable)) TTL_finish_buffering(
255 TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering) {
257}
258/*
259 * TTL_duplex_scheme.h
260 *
261 * Copyright (c) 2023 Mobileye
262 *
263 * Licensed under the Apache License, Version 2.0 (the License);
264 * you may not use this file except in compliance with the License.
265 * You may obtain a copy of the License at
266 *
267 * http://www.apache.org/licenses/LICENSE-2.0
268 *
269 * Unless required by applicable law or agreed to in writing, software
270 * distributed under the License is distributed on an AS IS BASIS,
271 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
272 * See the License for the specific language governing permissions and
273 * limitations under the License.
274 */
275
276// clang-format off
277/**
278 * @file
279 *
280 * Given a pair of blocking import and export operations that can execute concurrently,
281 * TTL_duplex_buffering issues them together and then waits on both to complete,
282 * ideally executing them in parallel. This scheme uses two
283 * internal buffers, one for the import and one for the export. Note that the
284 * export is pipelined to pair the import of the current tile with the export of
285 * the previous tile.
286 *
287 * The following table shows the pipelined actions performed in duplex buffering.
288 * It specifies which tile is processed in each iteration:
289 *
290 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
291 * |-------------------|-----|-----|----------------------|---------------|
292 * | **Import** | 0 | 1 | i | |
293 * | **Wait Import** | 0 | 1 | i | |
294 * | **Compute** | 0 | 1 | i | |
295 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
296 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
297 *
298 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
299 *
300 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
301 *
302 * #define TTL_TENSOR_TYPE void
303 * #define TTL_TENSOR_TYPE uchar
304 * etc.
305 *
306 * @example TTL_duplex_buffering.cl
307 */
308// clang-format on
309
310// This file presumes that the following have been pre-included;
311// this is not done here for path reasons.
312// #include "TTL_core.h"
313// #include "TTL_import_export.h"
314// #include TTL_IMPORT_EXPORT_INCLUDE_H
315
316/**
317 * @def The structs used for this buffering type
318 */
319
320/**
321 * @brief Data required to perform duplex buffer pipelining.
322 *
323 * @see TTL_start_duplex_buffering for a description of duplex buffer
324 * pipelining.
325 */
326typedef struct {
327 struct {
328 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
329 0->1->0->1... etc */
330 __local char *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
331 TTL_ext_char_tensor_t ext_tensor_in; /*!< The external tensor being input */
332 TTL_ext_char_tensor_t ext_tensor_out; /*!< The external tensor being output */
333 } common; ///< The information that is common to all pipeline schemes
334
335 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
336 ///< external-to-internal transfers, the second for
337 ///< internal-to-external transfers
338
339 /**
340 * @brief Store of the buffers used for the previous import/export cycles.
341 *
342 */
343 struct {
344 TTL_ext_char_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
345 TTL_const_int_char_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
346 } prev_out_tensors;
347} TTL_duplex_const_char_tensor_buffering_t;
348
349/*
350 * Predeclare TTL_step_buffering.
351 */
352static inline TTL_io_char_tensor_t __attribute__((overloadable)) TTL_step_buffering(
353 TTL_duplex_const_char_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
354 TTL_tile_t tile_current_export);
355
356/**
357 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
358 *
359 * @param ext_tensor_in A tensor describing the input in global memory
360 * @param int_base_in The address of the local import buffer.
361 * @param ext_tensor_out A tensor describing the output in global memory
362 * @param int_base_out The address of the local export buffer.
363 * @param events A pointer to a list of 2 events.
364 * The first event in the list will be used for imports, the second event in
365 * the list will be used for exports.
366 * @param first_tile The first tile to fetch for the scheme
367 *
368 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
369 *
370 * The first event in the list will be used for imports,
371 * the second event in the list will be used for exports.
372 * \n\n Example:
373 * @code
374 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
375 *
376 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
377 * ext_tensor_in, l_buffers[0],
378 * ext_tensor_out, l_buffers[1],
379 * &events, first_tile);
380 * @endcode
381 * \n
382 *
383 * @return The TTL_duplex_buffering_t created from the input parameters.
384 *
385 * Duplex buffering is described in more detail below.
386 *
387 * The simplest form of duplex buffering takes the following flow.
388 *
389 * @startuml
390 *
391 * start
392 *
393 * :Create a TTL_tiler_t with TTL_create_tiler;
394 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
395 * 1 input buffer, 1 output buffer;
396 * :NumberOfTiles = TTL_number_of_tiles(tiler);
397 *
398 * while (for each tile)
399 *
400 * :Import The Next Tile into the input buffer;
401 *
402 * :Process the Tile from the input buffer to the output buffer;
403 *
404 * :Export the Processed Tile from the output buffer;
405 *
406 * endwhile
407 *
408 * stop
409 *
410 * @enduml
411 *
412 * This can be optimized and standardized using the TTL_step_buffering
413 * call.
414 *
415 * @startuml
416 *
417 * start
418 *
419 * :Create a TTL_tiler_t with TTL_create_tiler;
420 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
421 * :NumberOfTiles = TTL_number_of_tiles(tiler);
422 *
423 * while (for each tile)
424 *
425 * :Call TTL_step_buffering for the current tile
426 *
427 * This will import the current new tile and export the last tile
428 * in parallel;
429 *
430 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
431 * :Process the Tile from the input buffer to the output buffer;
432 * endif
433 *
434 * endwhile
435 *
436 * stop
437 *
438 * @enduml
439 */
440static inline TTL_duplex_const_char_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
441 TTL_ext_char_tensor_t ext_tensor_in, __local char *int_base_in, TTL_ext_char_tensor_t ext_tensor_out,
442 __local char *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
443 TTL_duplex_const_char_tensor_buffering_t result;
444 result.common.int_base[0] = int_base_in;
445 result.common.int_base[1] = int_base_out;
446
447 result.common.ext_tensor_in = ext_tensor_in;
448 result.common.ext_tensor_out = ext_tensor_out;
449 result.events = events;
452
453 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
454
455 return result;
456}
457
458static inline TTL_io_char_tensor_t __attribute__((overloadable)) TTL_step_buffering(
459 TTL_duplex_const_char_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
460 TTL_tile_t tile_current_export) {
461 const TTL_layout_t next_import_layout =
462 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
463 const TTL_const_ext_char_tensor_t next_import_ext_tensor =
465 tile_current_import.shape,
466 duplex_buffering->common.ext_tensor_in.layout,
467 tile_current_import.offset,
468 duplex_buffering->common.ext_tensor_in.elem_size);
469 const TTL_int_char_sub_tensor_t next_import_int_sub_tensor =
470 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
471 tile_current_import.shape,
472 next_import_layout,
473 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
474 tile_current_import.offset);
475
476 const TTL_const_int_char_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
477 const TTL_ext_char_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
478
479 if (TTL_tile_empty(tile_current_import) == false)
480 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
481 *TTL_to_void_tensor(&next_import_ext_tensor),
482 &(*duplex_buffering->events)[0]);
483
484 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
485 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
486 *TTL_to_void_tensor(&next_export_ext_tensor),
487 &(*duplex_buffering->events)[1]);
488
489 const TTL_layout_t int_export_layout =
490 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
491 const TTL_ext_char_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
492 tile_current_export.shape,
493 duplex_buffering->common.ext_tensor_out.layout,
494 tile_current_export.offset,
495 duplex_buffering->common.ext_tensor_out.elem_size);
496 const TTL_int_char_sub_tensor_t to_export_from =
497 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
498 tile_current_export.shape,
499 int_export_layout,
500 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
501 tile_current_export.offset);
502
503 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
504 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
505
506 TTL_wait(2, *duplex_buffering->events);
507
508 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
509}
510
511static inline void __attribute__((overloadable)) TTL_finish_buffering(
512 TTL_duplex_const_char_tensor_buffering_t *const duplex_buffering) {
514}
515/*
516 * TTL_duplex_scheme.h
517 *
518 * Copyright (c) 2023 Mobileye
519 *
520 * Licensed under the Apache License, Version 2.0 (the License);
521 * you may not use this file except in compliance with the License.
522 * You may obtain a copy of the License at
523 *
524 * http://www.apache.org/licenses/LICENSE-2.0
525 *
526 * Unless required by applicable law or agreed to in writing, software
527 * distributed under the License is distributed on an AS IS BASIS,
528 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
529 * See the License for the specific language governing permissions and
530 * limitations under the License.
531 */
532
533// clang-format off
534/**
535 * @file
536 *
537 * Given a pair of blocking import and export operations that can execute concurrently,
538 * TTL_duplex_buffering issues them together and then waits on both to complete,
539 * ideally executing them in parallel. This scheme uses two
540 * internal buffers, one for the import and one for the export. Note that the
541 * export is pipelined to pair the import of the current tile with the export of
542 * the previous tile.
543 *
544 * The following table shows the pipelined actions performed in duplex buffering.
545 * It specifies which tile is processed in each iteration:
546 *
547 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
548 * |-------------------|-----|-----|----------------------|---------------|
549 * | **Import** | 0 | 1 | i | |
550 * | **Wait Import** | 0 | 1 | i | |
551 * | **Compute** | 0 | 1 | i | |
552 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
553 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
554 *
555 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
556 *
557 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
558 *
559 * #define TTL_TENSOR_TYPE void
560 * #define TTL_TENSOR_TYPE uchar
561 * etc.
562 *
563 * @example TTL_duplex_buffering.cl
564 */
565// clang-format on
566
567// This file presumes that the following have been pre-included;
568// this is not done here for path reasons.
569// #include "TTL_core.h"
570// #include "TTL_import_export.h"
571// #include TTL_IMPORT_EXPORT_INCLUDE_H
572
573/**
574 * @def The structs used for this buffering type
575 */
576
577/**
578 * @brief Data required to perform duplex buffer pipelining.
579 *
580 * @see TTL_start_duplex_buffering for a description of duplex buffer
581 * pipelining.
582 */
583typedef struct {
584 struct {
585 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
586 0->1->0->1... etc */
587 __local uchar *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
588 TTL_ext_uchar_tensor_t ext_tensor_in; /*!< The external tensor being input */
589 TTL_ext_uchar_tensor_t ext_tensor_out; /*!< The external tensor being output */
590 } common; ///< The information that is common to all pipeline schemes
591
592 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
593 ///< external-to-internal transfers, the second for
594 ///< internal-to-external transfers
595
596 /**
597 * @brief Store of the buffers used for the previous import/export cycles.
598 *
599 */
600 struct {
601 TTL_ext_uchar_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
602 TTL_const_int_uchar_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
603 } prev_out_tensors;
604} TTL_duplex_const_uchar_tensor_buffering_t;
605
606/*
607 * Predeclare TTL_step_buffering.
608 */
609static inline TTL_io_uchar_tensor_t __attribute__((overloadable)) TTL_step_buffering(
610 TTL_duplex_const_uchar_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
611 TTL_tile_t tile_current_export);
612
613/**
614 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
615 *
616 * @param ext_tensor_in A tensor describing the input in global memory
617 * @param int_base_in The address of the local import buffer.
618 * @param ext_tensor_out A tensor describing the output in global memory
619 * @param int_base_out The address of the local export buffer.
620 * @param events A pointer to a list of 2 events.
621 * The first event in the list will be used for imports, the second event in
622 * the list will be used for exports.
623 * @param first_tile The first tile to fetch for the scheme
624 *
625 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
626 *
627 * The first event in the list will be used for imports,
628 * the second event in the list will be used for exports.
629 * \n\n Example:
630 * @code
631 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
632 *
633 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
634 * ext_tensor_in, l_buffers[0],
635 * ext_tensor_out, l_buffers[1],
636 * &events, first_tile);
637 * @endcode
638 * \n
639 *
640 * @return The TTL_duplex_buffering_t created from the input parameters.
641 *
642 * Duplex buffering is described in more detail below.
643 *
644 * The simplest form of duplex buffering takes the following flow.
645 *
646 * @startuml
647 *
648 * start
649 *
650 * :Create a TTL_tiler_t with TTL_create_tiler;
651 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
652 * 1 input buffer, 1 output buffer;
653 * :NumberOfTiles = TTL_number_of_tiles(tiler);
654 *
655 * while (for each tile)
656 *
657 * :Import The Next Tile into the input buffer;
658 *
659 * :Process the Tile from the input buffer to the output buffer;
660 *
661 * :Export the Processed Tile from the output buffer;
662 *
663 * endwhile
664 *
665 * stop
666 *
667 * @enduml
668 *
669 * This can be optimized and standardized using the TTL_step_buffering
670 * call.
671 *
672 * @startuml
673 *
674 * start
675 *
676 * :Create a TTL_tiler_t with TTL_create_tiler;
677 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
678 * :NumberOfTiles = TTL_number_of_tiles(tiler);
679 *
680 * while (for each tile)
681 *
682 * :Call TTL_step_buffering for the current tile
683 *
684 * This will import the current new tile and export the last tile
685 * in parallel;
686 *
687 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
688 * :Process the Tile from the input buffer to the output buffer;
689 * endif
690 *
691 * endwhile
692 *
693 * stop
694 *
695 * @enduml
696 */
697static inline TTL_duplex_const_uchar_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
698 TTL_ext_uchar_tensor_t ext_tensor_in, __local uchar *int_base_in, TTL_ext_uchar_tensor_t ext_tensor_out,
699 __local uchar *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
700 TTL_duplex_const_uchar_tensor_buffering_t result;
701 result.common.int_base[0] = int_base_in;
702 result.common.int_base[1] = int_base_out;
703
704 result.common.ext_tensor_in = ext_tensor_in;
705 result.common.ext_tensor_out = ext_tensor_out;
706 result.events = events;
709
710 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
711
712 return result;
713}
714
715static inline TTL_io_uchar_tensor_t __attribute__((overloadable)) TTL_step_buffering(
716 TTL_duplex_const_uchar_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
717 TTL_tile_t tile_current_export) {
718 const TTL_layout_t next_import_layout =
719 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
720 const TTL_const_ext_uchar_tensor_t next_import_ext_tensor =
722 tile_current_import.shape,
723 duplex_buffering->common.ext_tensor_in.layout,
724 tile_current_import.offset,
725 duplex_buffering->common.ext_tensor_in.elem_size);
726 const TTL_int_uchar_sub_tensor_t next_import_int_sub_tensor =
727 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
728 tile_current_import.shape,
729 next_import_layout,
730 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
731 tile_current_import.offset);
732
733 const TTL_const_int_uchar_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
734 const TTL_ext_uchar_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
735
736 if (TTL_tile_empty(tile_current_import) == false)
737 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
738 *TTL_to_void_tensor(&next_import_ext_tensor),
739 &(*duplex_buffering->events)[0]);
740
741 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
742 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
743 *TTL_to_void_tensor(&next_export_ext_tensor),
744 &(*duplex_buffering->events)[1]);
745
746 const TTL_layout_t int_export_layout =
747 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
748 const TTL_ext_uchar_tensor_t to_export_to =
749 TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
750 tile_current_export.shape,
751 duplex_buffering->common.ext_tensor_out.layout,
752 tile_current_export.offset,
753 duplex_buffering->common.ext_tensor_out.elem_size);
754 const TTL_int_uchar_sub_tensor_t to_export_from =
755 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
756 tile_current_export.shape,
757 int_export_layout,
758 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
759 tile_current_export.offset);
760
761 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
762 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
763
764 TTL_wait(2, *duplex_buffering->events);
765
766 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
767}
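
For completeness, here is a hedged sketch of the compute() helper referenced in the kernel sketch near the top of this listing: it simply copies the imported tile into the export buffer. The imported_to and to_export_from members of TTL_io_uchar_tensor_t, and the base and row_spacing tensor fields used below, are assumptions borrowed from other TTL schemes, not definitions made in this file.

void compute(TTL_io_uchar_tensor_t io) {
    /* Tile just imported into the input buffer (int_base[0]). */
    const TTL_int_uchar_sub_tensor_t src = io.imported_to;      /* assumed member name */
    /* Region of the output buffer (int_base[1]) that the next step call will export. */
    const TTL_int_uchar_sub_tensor_t dst = io.to_export_from;   /* assumed member name */

    for (int y = 0; y < (int)src.tensor.shape.height; y++) {
        for (int x = 0; x < (int)src.tensor.shape.width; x++) {
            dst.tensor.base[y * dst.tensor.layout.row_spacing + x] =
                src.tensor.base[y * src.tensor.layout.row_spacing + x];
        }
    }
}
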
768
769static inline void __attribute__((overloadable)) TTL_finish_buffering(
770 TTL_duplex_const_uchar_tensor_buffering_t *const duplex_buffering) {
772}
773/*
774 * TTL_duplex_scheme.h
775 *
776 * Copyright (c) 2023 Mobileye
777 *
778 * Licensed under the Apache License, Version 2.0 (the License);
779 * you may not use this file except in compliance with the License.
780 * You may obtain a copy of the License at
781 *
782 * http://www.apache.org/licenses/LICENSE-2.0
783 *
784 * Unless required by applicable law or agreed to in writing, software
785 * distributed under the License is distributed on an AS IS BASIS,
786 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
787 * See the License for the specific language governing permissions and
788 * limitations under the License.
789 */
790
791// clang-format off
792/**
793 * @file
794 *
795 * Given a pair of blocking import and export operations that can execute concurrently,
796 * TTL_duplex_buffering issues them together and then waits on both to complete,
797 * ideally executing them in parallel. This scheme uses two
798 * internal buffers, one for the import and one for the export. Note that the
799 * export is pipelined to pair the import of the current tile with the export of
800 * the previous tile.
801 *
802 * The following table shows the pipelined actions performed in duplex buffering.
803 * It specifies which tile is processed in each iteration:
804 *
805 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
806 * |-------------------|-----|-----|----------------------|---------------|
807 * | **Import** | 0 | 1 | i | |
808 * | **Wait Import** | 0 | 1 | i | |
809 * | **Compute** | 0 | 1 | i | |
810 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
811 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
812 *
813 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
814 *
815 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
816 *
817 * #define TTL_TENSOR_TYPE void
818 * #define TTL_TENSOR_TYPE uchar
819 * etc.
820 *
821 * @example TTL_duplex_buffering.cl
822 */
823// clang-format on
824
825// This file presumes that the following have been pre-included;
826// this is not done here for path reasons.
827// #include "TTL_core.h"
828// #include "TTL_import_export.h"
829// #include TTL_IMPORT_EXPORT_INCLUDE_H
830
831/**
832 * @def The structs used for this buffering type
833 */
834
835/**
836 * @brief Data required to perform duplex buffer pipelining.
837 *
838 * @see TTL_start_duplex_buffering for a description of duplex buffer
839 * pipelining.
840 */
841typedef struct {
842 struct {
843 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
844 0->1->0->1... etc */
845 __local int *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
846 TTL_ext_int_tensor_t ext_tensor_in; /*!< The external tensor being input */
847 TTL_ext_int_tensor_t ext_tensor_out; /*!< The external tensor being output */
848 } common; ///< The information that is common to all pipeline schemes
849
850 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
851 ///< external-to-internal transfers, the second for
852 ///< internal-to-external transfers
853
854 /**
855 * @brief Store of the buffers used for the previous import/export cycles.
856 *
857 */
858 struct {
859 TTL_ext_int_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
860 TTL_const_int_int_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
861 } prev_out_tensors;
862} TTL_duplex_const_int_tensor_buffering_t;
863
864/*
865 * Predeclare TTL_step_buffering.
866 */
867static inline TTL_io_int_tensor_t __attribute__((overloadable)) TTL_step_buffering(
868 TTL_duplex_const_int_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
869 TTL_tile_t tile_current_export);
870
871/**
872 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
873 *
874 * @param ext_tensor_in A tensor describing the input in global memory
875 * @param int_base_in The address of the local import buffer.
876 * @param ext_tensor_out A tensor describing the output in global memory
877 * @param int_base_out The address of the local export buffer.
878 * @param events A pointer to a list of 2 events.
879 * The first event in the list will be used for imports, the second event in
880 * the list will be used for exports.
881 * @param first_tile The first tile to fetch for the scheme
882 *
883 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
884 *
885 * The first event in the list will be used for imports,
886 * the second event in the list will be used for exports.
887 * \n\n Example:
888 * @code
889 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
890 *
891 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
892 * ext_tensor_in, l_buffers[0],
893 * ext_tensor_out, l_buffers[1],
894 * &events, first_tile);
895 * @endcode
896 * \n
897 *
898 * @return The TTL_duplex_buffering_t created from the input parameters.
899 *
900 * Duplex buffering is described in more detail below.
901 *
902 * The simplest form of duplex buffering takes the following flow.
903 *
904 * @startuml
905 *
906 * start
907 *
908 * :Create a TTL_tiler_t with TTL_create_tiler;
909 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
910 * 1 input buffer, 1 output buffer;
911 * :NumberOfTiles = TTL_number_of_tiles(tiler);
912 *
913 * while (for each tile)
914 *
915 * :Import The Next Tile into the input buffer;
916 *
917 * :Process the Tile from the input buffer to the output buffer;
918 *
919 * :Export the Processed Tile from the output buffer;
920 *
921 * endwhile
922 *
923 * stop
924 *
925 * @enduml
926 *
927 * This can be optimized and standardized using the TTL_step_buffering
928 * call.
929 *
930 * @startuml
931 *
932 * start
933 *
934 * :Create a TTL_tiler_t with TTL_create_tiler;
935 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
936 * :NumberOfTiles = TTL_number_of_tiles(tiler);
937 *
938 * while (for each tile)
939 *
940 * :Call TTL_step_buffering for the current tile
941 *
942 * This will import the current new tile and export the last tile
943 * in parallel;
944 *
945 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
946 * :Process the Tile from the input buffer to the output buffer;
947 * endif
948 *
949 * endwhile
950 *
951 * stop
952 *
953 * @enduml
954 */
955static inline TTL_duplex_const_int_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
956 TTL_ext_int_tensor_t ext_tensor_in, __local int *int_base_in, TTL_ext_int_tensor_t ext_tensor_out,
957 __local int *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
958 TTL_duplex_const_int_tensor_buffering_t result;
959 result.common.int_base[0] = int_base_in;
960 result.common.int_base[1] = int_base_out;
961
962 result.common.ext_tensor_in = ext_tensor_in;
963 result.common.ext_tensor_out = ext_tensor_out;
964 result.events = events;
967
968 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
969
970 return result;
971}
972
973static inline TTL_io_int_tensor_t __attribute__((overloadable)) TTL_step_buffering(
974 TTL_duplex_const_int_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
975 TTL_tile_t tile_current_export) {
976 const TTL_layout_t next_import_layout =
977 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
978 const TTL_const_ext_int_tensor_t next_import_ext_tensor =
980 tile_current_import.shape,
981 duplex_buffering->common.ext_tensor_in.layout,
982 tile_current_import.offset,
983 duplex_buffering->common.ext_tensor_in.elem_size);
984 const TTL_int_int_sub_tensor_t next_import_int_sub_tensor =
985 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
986 tile_current_import.shape,
987 next_import_layout,
988 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
989 tile_current_import.offset);
990
991 const TTL_const_int_int_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
992 const TTL_ext_int_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
993
994 if (TTL_tile_empty(tile_current_import) == false)
995 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
996 *TTL_to_void_tensor(&next_import_ext_tensor),
997 &(*duplex_buffering->events)[0]);
998
999 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1000 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1001 *TTL_to_void_tensor(&next_export_ext_tensor),
1002 &(*duplex_buffering->events)[1]);
1003
1004 const TTL_layout_t int_export_layout =
1005 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1006 const TTL_ext_int_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1007 tile_current_export.shape,
1008 duplex_buffering->common.ext_tensor_out.layout,
1009 tile_current_export.offset,
1010 duplex_buffering->common.ext_tensor_out.elem_size);
1011 const TTL_int_int_sub_tensor_t to_export_from =
1012 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1013 tile_current_export.shape,
1014 int_export_layout,
1015 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1016 tile_current_export.offset);
1017
1018 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1019 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1020
1021 TTL_wait(2, *duplex_buffering->events);
1022
1023 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1024}
1025
1026static inline void __attribute__((overloadable)) TTL_finish_buffering(
1027 TTL_duplex_const_int_tensor_buffering_t *const duplex_buffering) {
1029}
1030/*
1031 * TTL_duplex_scheme.h
1032 *
1033 * Copyright (c) 2023 Mobileye
1034 *
1035 * Licensed under the Apache License, Version 2.0 (the License);
1036 * you may not use this file except in compliance with the License.
1037 * You may obtain a copy of the License at
1038 *
1039 * http://www.apache.org/licenses/LICENSE-2.0
1040 *
1041 * Unless required by applicable law or agreed to in writing, software
1042 * distributed under the License is distributed on an AS IS BASIS,
1043 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1044 * See the License for the specific language governing permissions and
1045 * limitations under the License.
1046 */
1047
1048// clang-format off
1049/**
1050 * @file
1051 *
1052 * Given a pair of blocking import and export operations that can execute concurrently,
1053 * TTL_duplex_buffering issues them together and then waits on both to complete,
1054 * ideally executing them in parallel. This scheme uses two
1055 * internal buffers, one for the import and one for the export. Note that the
1056 * export is pipelined to pair the import of the current tile with the export of
1057 * the previous tile.
1058 *
1059 * The following table shows the pipelined actions performed in duplex buffering.
1060 * It specifies which tile is processed in each iteration:
1061 *
1062 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1063 * |-------------------|-----|-----|----------------------|---------------|
1064 * | **Import** | 0 | 1 | i | |
1065 * | **Wait Import** | 0 | 1 | i | |
1066 * | **Compute** | 0 | 1 | i | |
1067 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1068 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1069 *
1070 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
1071 *
1072 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
1073 *
1074 * #define TTL_TENSOR_TYPE void
1075 * #define TTL_TENSOR_TYPE uchar
1076 * etc.
1077 *
1078 * @example TTL_duplex_buffering.cl
1079 */
1080// clang-format on
1081
1083// This file presumes that the following have been pre-included;
1084// this is not done here for path reasons.
1084// #include "TTL_core.h"
1085// #include "TTL_import_export.h"
1086// #include TTL_IMPORT_EXPORT_INCLUDE_H
1087
1088/**
1089 * @def The structs used for this buffering type
1090 */
1091
1092/**
1093 * @brief Data required to perform duplex buffer pipelining.
1094 *
1095 * @see TTL_start_duplex_buffering for a description of duplex buffer
1096 * pipelining.
1097 */
1098typedef struct {
1099 struct {
1100 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1101 0->1->0->1... etc */
1102 __local uint *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1103 TTL_ext_uint_tensor_t ext_tensor_in; /*!< The external tensor being input */
1104 TTL_ext_uint_tensor_t ext_tensor_out; /*!< The external tensor being output */
1105 } common; ///< The information that is common to all pipeline schemes
1106
1107 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
1108 ///< external-to-internal transfers, the second for
1109 ///< internal-to-external transfers
1110
1111 /**
1112 * @brief Store of the buffers used for the previous import/export cycles.
1113 *
1114 */
1115 struct {
1116 TTL_ext_uint_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
1117 TTL_const_int_uint_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
1118 } prev_out_tensors;
1119} TTL_duplex_const_uint_tensor_buffering_t;
1120
1121/*
1122 * Predeclare TTL_step_buffering.
1123 */
1124static inline TTL_io_uint_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1125 TTL_duplex_const_uint_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1126 TTL_tile_t tile_current_export);
1127
1128/**
1129 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1130 *
1131 * @param ext_tensor_in A tensor describing the input in global memory
1132 * @param int_base_in The address of the local import buffer.
1133 * @param ext_tensor_out A tensor describing the output in global memory
1134 * @param int_base_out The address of the local export buffer.
1135 * @param events A pointer to a list of 2 events.
1136 * The first event in the list will be used for imports, the second event in
1137 * the list will be used for exports.
1138 * @param first_tile The first tile to fetch for the scheme
1139 *
1140 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1141 *
1142 * The first event in the list will be used for imports,
1143 * the second event in the list will be used for exports.
1144 * \n\n Example:
1145 * @code
1146 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1147 *
1148 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1149 * ext_tensor_in, l_buffers[0],
1150 * ext_tensor_out, l_buffers[1],
1151 * &events, first_tile);
1152 * @endcode
1153 * \n
1154 *
1155 * @return The TTL_duplex_buffering_t created from the input parameters.
1156 *
1157 * Duplex buffering is described in more detail below.
1158 *
1159 * The simplest form of duplex buffering takes the following flow.
1160 *
1161 * @startuml
1162 *
1163 * start
1164 *
1165 * :Create a TTL_tiler_t with TTL_create_tiler;
1166 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1167 * 1 input buffer, 1 output buffer;
1168 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1169 *
1170 * while (for each tile)
1171 *
1172 * :Import The Next Tile into the input buffer;
1173 *
1174 * :Process the Tile from the input buffer to the output buffer;
1175 *
1176 * :Export the Processed Tile from the output buffer;
1177 *
1178 * endwhile
1179 *
1180 * stop
1181 *
1182 * @enduml
1183 *
1184 * This can be optimized and standardized using the TTL_step_buffering
1185 * call.
1186 *
1187 * @startuml
1188 *
1189 * start
1190 *
1191 * :Create a TTL_tiler_t with TTL_create_tiler;
1192 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1193 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1194 *
1195 * while (for each tile)
1196 *
1197 * :Call TTL_step_buffering for the current tile
1198 *
1199 * This will import the current new tile and export the last tile
1200 * in parallel;
1201 *
1202 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1203 * :Process the Tile from the input buffer to the output buffer;
1204 * endif
1205 *
1206 * endwhile
1207 *
1208 * stop
1209 *
1210 * @enduml
1211 */
1212static inline TTL_duplex_const_uint_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1213 TTL_ext_uint_tensor_t ext_tensor_in, __local uint *int_base_in, TTL_ext_uint_tensor_t ext_tensor_out,
1214 __local uint *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1215 TTL_duplex_const_uint_tensor_buffering_t result;
1216 result.common.int_base[0] = int_base_in;
1217 result.common.int_base[1] = int_base_out;
1218
1219 result.common.ext_tensor_in = ext_tensor_in;
1220 result.common.ext_tensor_out = ext_tensor_out;
1221 result.events = events;
1224
1225 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1226
1227 return result;
1228}
1229
1230static inline TTL_io_uint_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1231 TTL_duplex_const_uint_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
1232 TTL_tile_t tile_current_export) {
1233 const TTL_layout_t next_import_layout =
1234 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
1235 const TTL_const_ext_uint_tensor_t next_import_ext_tensor =
1237 tile_current_import.shape,
1238 duplex_buffering->common.ext_tensor_in.layout,
1239 tile_current_import.offset,
1240 duplex_buffering->common.ext_tensor_in.elem_size);
1241 const TTL_int_uint_sub_tensor_t next_import_int_sub_tensor =
1242 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
1243 tile_current_import.shape,
1244 next_import_layout,
1245 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1246 tile_current_import.offset);
1247
1248 const TTL_const_int_uint_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
1249 const TTL_ext_uint_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
1250
1251 if (TTL_tile_empty(tile_current_import) == false)
1252 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1253 *TTL_to_void_tensor(&next_import_ext_tensor),
1254 &(*duplex_buffering->events)[0]);
1255
1256 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1257 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1258 *TTL_to_void_tensor(&next_export_ext_tensor),
1259 &(*duplex_buffering->events)[1]);
1260
1261 const TTL_layout_t int_export_layout =
1262 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1263 const TTL_ext_uint_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1264 tile_current_export.shape,
1265 duplex_buffering->common.ext_tensor_out.layout,
1266 tile_current_export.offset,
1267 duplex_buffering->common.ext_tensor_out.elem_size);
1268 const TTL_int_uint_sub_tensor_t to_export_from =
1269 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1270 tile_current_export.shape,
1271 int_export_layout,
1272 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1273 tile_current_export.offset);
1274
1275 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1276 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1277
1278 TTL_wait(2, *duplex_buffering->events);
1279
1280 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1281}
1282
1283static inline void __attribute__((overloadable)) TTL_finish_buffering(
1284 TTL_duplex_const_uint_tensor_buffering_t *const duplex_buffering) {
1286}
1287/*
1288 * TTL_duplex_scheme.h
1289 *
1290 * Copyright (c) 2023 Mobileye
1291 *
1292 * Licensed under the Apache License, Version 2.0 (the License);
1293 * you may not use this file except in compliance with the License.
1294 * You may obtain a copy of the License at
1295 *
1296 * http://www.apache.org/licenses/LICENSE-2.0
1297 *
1298 * Unless required by applicable law or agreed to in writing, software
1299 * distributed under the License is distributed on an AS IS BASIS,
1300 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1301 * See the License for the specific language governing permissions and
1302 * limitations under the License.
1303 */
1304
1305// clang-format off
1306/**
1307 * @file
1308 *
1309 * Given a pair of blocking import and export operations that can execute concurrently,
1310 * TTL_duplex_buffering issues them together and then waits on both to complete,
1311 * ideally executing them in parallel. This scheme uses two
1312 * internal buffers, one for the import and one for the export. Note that the
1313 * export is pipelined to pair the import of the current tile with the export of
1314 * the previous tile.
1315 *
1316 * The following table shows the pipelined actions performed in duplex buffering.
1317 * It specifies which tile is processed in each iteration:
1318 *
1319 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1320 * |-------------------|-----|-----|----------------------|---------------|
1321 * | **Import** | 0 | 1 | i | |
1322 * | **Wait Import** | 0 | 1 | i | |
1323 * | **Compute** | 0 | 1 | i | |
1324 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1325 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1326 *
1327 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
1328 *
1329 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
1330 *
1331 * #define TTL_TENSOR_TYPE void
1332 * #define TTL_TENSOR_TYPE uchar
1333 * etc.
1334 *
1335 * @example TTL_duplex_buffering.cl
1336 */
1337// clang-format on
1338
1340// This file presumes that the following have been pre-included;
1341// this is not done here for path reasons.
1341// #include "TTL_core.h"
1342// #include "TTL_import_export.h"
1343// #include TTL_IMPORT_EXPORT_INCLUDE_H
1344
1345/**
1346 * @def The structs used for this buffering type
1347 */
1348
1349/**
1350 * @brief Data required to perform duplex buffer pipelining.
1351 *
1352 * @see TTL_start_duplex_buffering for a description of duplex buffer
1353 * pipelining.
1354 */
1355typedef struct {
1356 struct {
1357 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1358 0->1->0->1... etc */
1359 __local short *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1360 TTL_ext_short_tensor_t ext_tensor_in; /*!< The external tensor being input */
1361 TTL_ext_short_tensor_t ext_tensor_out; /*!< The external tensor being output */
1362 } common; ///< The information that is common to all pipeline schemes
1363
1364 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
1365 ///< external-to-internal transfers, the second for
1366 ///< internal-to-external transfers
1367
1368 /**
1369 * @brief Store of the buffers used for the previous import/export cycles.
1370 *
1371 */
1372 struct {
1373 TTL_ext_short_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
1374 TTL_const_int_short_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
1375 } prev_out_tensors;
1376} TTL_duplex_const_short_tensor_buffering_t;
1377
1378/*
1379 * Predeclare TTL_step_buffering.
1380 */
1381static inline TTL_io_short_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1382 TTL_duplex_const_short_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1383 TTL_tile_t tile_current_export);
1384
1385/**
1386 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1387 *
1388 * @param ext_tensor_in A tensor describing the input in global memory
1389 * @param int_base_in The address of the local import buffer.
1390 * @param ext_tensor_out A tensor describing the output in global memory
1391 * @param int_base_out The address of the local export buffer.
1392 * @param events A pointer to a list of 2 events.
1393 * The first event in the list will be used for imports, the second event in
1394 * the list will be used for exports.
1395 * @param first_tile The first tile to fetch for the scheme
1396 *
1397 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1398 *
1399 * The first event in the list will be used for imports,
1400 * the second event in the list will be used for exports.
1401 * \n\n Example:
1402 * @code
1403 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1404 *
1405 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1406 * ext_tensor_in, l_buffers[0],
1407 * ext_tensor_out, l_buffers[1],
1408 * &events, first_tile);
1409 * @endcode
1410 * \n
1411 *
1412 * @return The TTL_duplex_buffering_t created from the input parameters.
1413 *
1414 * Duplex buffering is described in more detail below.
1415 *
1416 * The simplest form of duplex buffering takes the following flow.
1417 *
1418 * @startuml
1419 *
1420 * start
1421 *
1422 * :Create a TTL_tiler_t with TTL_create_tiler;
1423 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1424 * 1 input buffer, 1 output buffer;
1425 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1426 *
1427 * while (for each tile)
1428 *
1429 * :Import The Next Tile into the input buffer;
1430 *
1431 * :Process the Tile from the input buffer to the output buffer;
1432 *
1433 * :Export the Processed Tile from the output buffer;
1434 *
1435 * endwhile
1436 *
1437 * stop
1438 *
1439 * @enduml
1440 *
1441 * This can be optimized and standardized using the TTL_step_buffering
1442 * call.
1443 *
1444 * @startuml
1445 *
1446 * start
1447 *
1448 * :Create a TTL_tiler_t with TTL_create_tiler;
1449 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1450 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1451 *
1452 * while (for each tile)
1453 *
1454 * :Call TTL_step_buffering for the current tile
1455 *
1456 * This will import the current new tile and export the last tile
1457 * in parallel;
1458 *
1459 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1460 * :Process the Tile from the input buffer to the output buffer;
1461 * endif
1462 *
1463 * endwhile
1464 *
1465 * stop
1466 *
1467 * @enduml
1468 */
1469static inline TTL_duplex_const_short_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1470 TTL_ext_short_tensor_t ext_tensor_in, __local short *int_base_in, TTL_ext_short_tensor_t ext_tensor_out,
1471 __local short *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1472 TTL_duplex_const_short_tensor_buffering_t result;
1473 result.common.int_base[0] = int_base_in;
1474 result.common.int_base[1] = int_base_out;
1475
1476 result.common.ext_tensor_in = ext_tensor_in;
1477 result.common.ext_tensor_out = ext_tensor_out;
1478 result.events = events;
1481
1482 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1483
1484 return result;
1485}
1486
1487static inline TTL_io_short_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1488 TTL_duplex_const_short_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
1489 TTL_tile_t tile_current_export) {
1490 const TTL_layout_t next_import_layout =
1491 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
1492 const TTL_const_ext_short_tensor_t next_import_ext_tensor =
1494 tile_current_import.shape,
1495 duplex_buffering->common.ext_tensor_in.layout,
1496 tile_current_import.offset,
1497 duplex_buffering->common.ext_tensor_in.elem_size);
1498 const TTL_int_short_sub_tensor_t next_import_int_sub_tensor =
1499 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
1500 tile_current_import.shape,
1501 next_import_layout,
1502 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1503 tile_current_import.offset);
1504
1505 const TTL_const_int_short_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
1506 const TTL_ext_short_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
1507
1508 if (TTL_tile_empty(tile_current_import) == false)
1509 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1510 *TTL_to_void_tensor(&next_import_ext_tensor),
1511 &(*duplex_buffering->events)[0]);
1512
1513 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1514 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1515 *TTL_to_void_tensor(&next_export_ext_tensor),
1516 &(*duplex_buffering->events)[1]);
1517
1518 const TTL_layout_t int_export_layout =
1519 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1520 const TTL_ext_short_tensor_t to_export_to =
1521 TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1522 tile_current_export.shape,
1523 duplex_buffering->common.ext_tensor_out.layout,
1524 tile_current_export.offset,
1525 duplex_buffering->common.ext_tensor_out.elem_size);
1526 const TTL_int_short_sub_tensor_t to_export_from =
1527 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1528 tile_current_export.shape,
1529 int_export_layout,
1530 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1531 tile_current_export.offset);
1532
1533 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1534 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1535
1536 TTL_wait(2, *duplex_buffering->events);
1537
1538 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1539}
1540
1541static inline void __attribute__((overloadable)) TTL_finish_buffering(
1542 TTL_duplex_const_short_tensor_buffering_t *const duplex_buffering) {
1544}
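// Editor's sketch (not part of the original header): one way a kernel can drive the
// short-tensor duplex scheme defined above. The tiler, TTL_get_tile(i, tiler), the two
// __local buffers, the external tensors and compute() are assumed to be provided by the
// caller; the imported_to / to_export_from field names follow the TTL_create_io_tensors
// call above.
//
//     TTL_event_t events[2] = { TTL_get_event(), TTL_get_event() };
//     TTL_duplex_const_short_tensor_buffering_t scheme = TTL_start_duplex_buffering(
//         ext_tensor_in, l_buffer_in, ext_tensor_out, l_buffer_out, &events,
//         TTL_get_tile(0, tiler));
//
//     for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
//         const TTL_tile_t tile = TTL_get_tile(i, tiler);
//         // Import tile i and export tile i-1 in parallel, then wait for both.
//         const TTL_io_short_tensor_t io = TTL_step_buffering(&scheme, tile, tile);
//         compute(io.imported_to, io.to_export_from);  // user-supplied per-tile processing
//     }
//
//     // Epilog (iteration #NumOfTiles in the table above): drain the final export.
//     TTL_step_buffering(&scheme, TTL_create_empty_tile(), TTL_create_empty_tile());
//     TTL_finish_buffering(&scheme);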
1545/*
1546 * TTL_duplex_scheme.h
1547 *
1548 * Copyright (c) 2023 Mobileye
1549 *
1550 * Licensed under the Apache License, Version 2.0 (the License);
1551 * you may not use this file except in compliance with the License.
1552 * You may obtain a copy of the License at
1553 *
1554 * http://www.apache.org/licenses/LICENSE-2.0
1555 *
1556 * Unless required by applicable law or agreed to in writing, software
1557 * distributed under the License is distributed on an AS IS BASIS,
1558 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1559 * See the License for the specific language governing permissions and
1560 * limitations under the License.
1561 */
1562
1563// clang-format off
1564/**
1565 * @file
1566 *
1567 * Given a pair of blocking import and export operations that can execute concurrently,
1568 * TTL_duplex_buffering issues them together and then waits for both to complete,
1569 * ideally executing them in parallel to each other. This scheme uses two
1570 * internal buffers, one for the import and one for the export. Note that the
1571 * export is pipelined to pair the import of the current tile with the export of
1572 * the previous tile.
1573
1574 * The following table shows the pipelined actions performed in duplex buffering.
1575 * It specifies which tile is processed in each iteration:
1576 *
1577 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1578 * |-------------------|-----|-----|----------------------|---------------|
1579 * | **Import** | 0 | 1 | i | |
1580 * | **Wait Import** | 0 | 1 | i | |
1581 * | **Compute** | 0 | 1 | i | |
1582 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1583 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1584 *
1585 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
1586 *
1587 * When including this file the following must be defined
1588 *
1589 * #define TTL_TENSOR_TYPE void
1590 * #define TTL_TENSOR_TYPE uchar
1591 * etc
1592 *
1593 * @example TTL_duplex_buffering.cl
1594 */
1595// clang-format on
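// Editor's illustration: the pipeline table above worked through for NumOfTiles = 3.
//
//   iteration 0: import tile 0, wait import, compute tile 0
//   iteration 1: import tile 1, wait import, compute tile 1, export tile 0, wait export
//   iteration 2: import tile 2, wait import, compute tile 2, export tile 1, wait export
//   iteration 3: (epilog) export tile 2, wait export
//
// i.e. NumOfTiles + 1 steps in total: one per tile plus the extra epilog step that
// drains the last export.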
1596
1597// This file presumes that the following have been pre included.
1598// this is not done here for path reasons.
1599// #include "TTL_core.h"
1600// #include "TTL_import_export.h"
1601// #include TTL_IMPORT_EXPORT_INCLUDE_H
1602
1603/**
1604 * @def The structs used for this buffering type
1605 */
1606
1607/**
1608 * @brief Data required to perform duplex buffer pipelining.
1609 *
1610 * @see TTL_start_duplex_buffering for a description of duplex buffer
1611 * pipelining.
1612 */
1613typedef struct {
1614 struct {
1615 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1616 0->1->0->1... etc */
1617 __local ushort *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1618 TTL_ext_ushort_tensor_t ext_tensor_in; /*!< The external tensor being input */
1619 TTL_ext_ushort_tensor_t ext_tensor_out; /*!< The external tensor being output */
1620 } common; ///< The information that is common to all pipeline schemes
1621
1622 TTL_event_t (*events)[2]; ///< Two events are required; the first is used for
1623 ///< external to internal transfers, the second for
1624 ///< internal to external transfers
1625
1626 /**
1627 * @brief Store of the buffers used for the previous import/export cycles.
1628 *
1629 */
1630     struct {
1631         TTL_const_int_ushort_tensor_t to_export_from; /*!< Internal tensor to export from on the next step */
1632         TTL_ext_ushort_tensor_t to_export_to; /*!< External tensor to export to on the next step */
1633     } prev_out_tensors;
1634 } TTL_duplex_const_ushort_tensor_buffering_t;
1635
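// Editor's note: the typedef above stores only the pointer passed in as `events`
// (TTL_event_t (*)[2]); it does not copy the two events. The caller therefore keeps
// ownership of a two-element array that must stay alive for as long as the scheme is
// stepped. The caller-side names below are illustrative only:
//
//     TTL_event_t events[2] = { TTL_get_event(), TTL_get_event() };  // outlives the scheme
//     ...
//     TTL_start_duplex_buffering(ext_tensor_in, l_in, ext_tensor_out, l_out, &events, first_tile);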
1636/*
1637 * Predeclare TTL_step_buffering.
1638 */
1639static inline TTL_io_ushort_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1640 TTL_duplex_const_ushort_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1641 TTL_tile_t tile_current_export);
1642
1643/**
1644 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1645 *
1646 * @param ext_tensor_in A tensor describing the input in global memory
1647 * @param int_base_in The address of the local import buffer.
1648 * @param ext_tensor_out A tensor describing the output in global memory
1649 * @param int_base_out The address of the local export buffer.
1650 * @param events A pointer to a list of 2 events.
1651 * The first event in the list will be used for imports, the second event in
1652 * the list will be used for exports.
1653 * @param first_tile The first tile to fetch for the scheme
1654 *
1655 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1656 *
1657 * The first event in the list will be used for imports,
1658 * the second event in the list will be used for exports.
1659 * \n\n Example:
1660 * @code
1661 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1662 *
1663 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1664 * ext_tensor_in, l_buffers[0],
1665 * ext_tensor_out, l_buffers[1],
1666 * &events, first_tile);
1667 * @endcode
1668 * \n
1669 *
1670 * Duplex buffering uses one local buffer for the import and one for the export, and
1671 * pairs the import of the current tile with the export of the previous tile so that
1672 * the two transfers can proceed in parallel.
1673 *
1674 * The simplest form of duplex buffering takes the following flow.
1675 *
1676 * @startuml
1677 *
1678 * start
1679 *
1680 * :Create a TTL_tiler_t with TTL_create_tiler;
1681 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1682 * 1 input buffer, 1 output buffer;
1683 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1684 *
1685 * while (for each tile)
1686 *
1687 * :Import The Next Tile into the input buffer;
1688 *
1689 * :Process the Tile from the input buffer to the output buffer;
1690 *
1691 * :Export the processed tile from the output buffer;
1692 *
1693 * endwhile
1694 *
1695 * stop
1696 *
1697 * @enduml
1698 *
1699 * This can be optimized and standardized using the TTL_step_buffering
1700 * call.
1701 *
1702 * @startuml
1703 *
1704 * start
1705 *
1706 * :Create a TTL_tiler_t with TTL_create_tiler;
1707 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1708 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1709 *
1710 * while (for each tile)
1711 *
1712 * :Call TTL_step_buffering for the current tile
1713 *
1714 * This imports the new tile and exports the previous tile
1715 * in parallel;
1716 *
1717 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1718 * :Process the Tile from the input buffer to the output buffer;
1719 * endif
1720 *
1721 * endwhile
1722 *
1723 * stop
1724 *
1725 * @enduml
1726 */
1727 static inline TTL_duplex_const_ushort_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1728     TTL_ext_ushort_tensor_t ext_tensor_in, __local ushort *int_base_in, TTL_ext_ushort_tensor_t ext_tensor_out,
1729     __local ushort *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1730     TTL_duplex_const_ushort_tensor_buffering_t result;
1731     result.common.int_base[0] = int_base_in;
1732 result.common.int_base[1] = int_base_out;
1733
1734 result.common.ext_tensor_in = ext_tensor_in;
1735 result.common.ext_tensor_out = ext_tensor_out;
1736 result.events = events;
1739
1740 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1741
1742 return result;
1743}
1744
1745static inline TTL_io_ushort_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1746 TTL_duplex_const_ushort_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
1747 TTL_tile_t tile_current_export) {
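    // The incoming tile is described twice below: as a sub-tensor of the external input
    // (using ext_tensor_in's layout and the tile's offset) and as a packed internal
    // tensor at int_base[0] whose layout is built from the tile's own width and height.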
1748 const TTL_layout_t next_import_layout =
1749 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
1750     const TTL_const_ext_ushort_tensor_t next_import_ext_tensor =
1751         TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
1752                                     tile_current_import.shape,
1753 duplex_buffering->common.ext_tensor_in.layout,
1754 tile_current_import.offset,
1755 duplex_buffering->common.ext_tensor_in.elem_size);
1756 const TTL_int_ushort_sub_tensor_t next_import_int_sub_tensor =
1757 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
1758 tile_current_import.shape,
1759 next_import_layout,
1760 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1761 tile_current_import.offset);
1762
1763 const TTL_const_int_ushort_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
1764 const TTL_ext_ushort_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
1765
1766 if (TTL_tile_empty(tile_current_import) == false)
1767 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1768 *TTL_to_void_tensor(&next_import_ext_tensor),
1769 &(*duplex_buffering->events)[0]);
1770
1771 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1772 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1773 *TTL_to_void_tensor(&next_export_ext_tensor),
1774 &(*duplex_buffering->events)[1]);
1775
1776 const TTL_layout_t int_export_layout =
1777 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1778     const TTL_ext_ushort_tensor_t to_export_to =
1779         TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1780                               tile_current_export.shape,
1781 duplex_buffering->common.ext_tensor_out.layout,
1782 tile_current_export.offset,
1783 duplex_buffering->common.ext_tensor_out.elem_size);
1784 const TTL_int_ushort_sub_tensor_t to_export_from =
1785 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1786 tile_current_export.shape,
1787 int_export_layout,
1788 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1789 tile_current_export.offset);
1790
1791 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1792 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1793
1794 TTL_wait(2, *duplex_buffering->events);
1795
1796 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1797}
1798
1799static inline void __attribute__((overloadable)) TTL_finish_buffering(
1800 TTL_duplex_const_ushort_tensor_buffering_t *const duplex_buffering) {
1802}
1803/*
1804 * TTL_duplex_scheme.h
1805 *
1806 * Copyright (c) 2023 Mobileye
1807 *
1808 * Licensed under the Apache License, Version 2.0 (the License);
1809 * you may not use this file except in compliance with the License.
1810 * You may obtain a copy of the License at
1811 *
1812 * http://www.apache.org/licenses/LICENSE-2.0
1813 *
1814 * Unless required by applicable law or agreed to in writing, software
1815 * distributed under the License is distributed on an AS IS BASIS,
1816 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1817 * See the License for the specific language governing permissions and
1818 * limitations under the License.
1819 */
1820
1821// clang-format off
1822/**
1823 * @file
1824 *
1825 * Given a pair of blocking import and export operations that can execute concurrently,
1826 * TTL_duplex_buffering issues them together and then waits for both to complete,
1827 * ideally executing them in parallel to each other. This scheme uses two
1828 * internal buffers, one for the import and one for the export. Note that the
1829 * export is pipelined to pair the import of the current tile with the export of
1830 * the previous tile.
1831
1832 * The following table shows the pipelined actions performed in duplex buffering.
1833 * It specifies which tile is processed in each iteration:
1834 *
1835 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1836 * |-------------------|-----|-----|----------------------|---------------|
1837 * | **Import** | 0 | 1 | i | |
1838 * | **Wait Import** | 0 | 1 | i | |
1839 * | **Compute** | 0 | 1 | i | |
1840 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1841 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1842 *
1843 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
1844 *
1845 * When including this file the following must be defined
1846 *
1847 * #define TTL_TENSOR_TYPE void
1848 * #define TTL_TENSOR_TYPE uchar
1849 * etc
1850 *
1851 * @example TTL_duplex_buffering.cl
1852 */
1853// clang-format on
1854
1855// This file presumes that the following have been pre included.
1856// this is not done here for path reasons.
1857// #include "TTL_core.h"
1858// #include "TTL_import_export.h"
1859// #include TTL_IMPORT_EXPORT_INCLUDE_H
1860
1861/**
1862 * @def The structs used for this buffering type
1863 */
1864
1865/**
1866 * @brief Data required to perform duplex buffer pipelining.
1867 *
1868 * @see TTL_start_duplex_buffering for a description of duplex buffer
1869 * pipelining.
1870 */
1871typedef struct {
1872 struct {
1873 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1874 0->1->0->1... etc */
1875 __local long *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1876 TTL_ext_long_tensor_t ext_tensor_in; /*!< The external tensor being input */
1877 TTL_ext_long_tensor_t ext_tensor_out; /*!< The external tensor being output */
1878 } common; ///< The information that is common to all pipeline schemes
1879
1880 TTL_event_t (*events)[2]; ///< Two events are required; the first is used for
1881 ///< external to internal transfers, the second for
1882 ///< internal to external transfers
1883
1884 /**
1885 * @brief Store of the buffers used for the previous import/export cycles.
1886 *
1887 */
1888     struct {
1889         TTL_const_int_long_tensor_t to_export_from; /*!< Internal tensor to export from on the next step */
1890         TTL_ext_long_tensor_t to_export_to; /*!< External tensor to export to on the next step */
1891     } prev_out_tensors;
1892 } TTL_duplex_const_long_tensor_buffering_t;
1893
1894/*
1895 * Predeclare TTL_step_buffering.
1896 */
1897static inline TTL_io_long_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1898 TTL_duplex_const_long_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1899 TTL_tile_t tile_current_export);
1900
1901/**
1902 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1903 *
1904 * @param ext_tensor_in A tensor describing the input in global memory
1905 * @param int_base_in The address of the local import buffer.
1906 * @param ext_tensor_out A tensor describing the output in global memory
1907 * @param int_base_out The address of the local export buffer.
1908 * @param events A pointer to a list of 2 events.
1909 * The first event in the list will be used for imports, the second event in
1910 * the list will be used for exports.
1911 * @param first_tile The first tile to fetch for the scheme
1912 *
1913 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1914 *
1915 * The first event in the list will be used for imports,
1916 * the second event in the list will be used for exports.
1917 * \n\n Example:
1918 * @code
1919 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1920 *
1921 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1922 * ext_tensor_in, l_buffers[0],
1923 * ext_tensor_out, l_buffers[1],
1924 * &events, first_tile);
1925 * @endcode
1926 * \n
1927 *
1928 * Duplex buffering uses one local buffer for the import and one for the export, and
1929 * pairs the import of the current tile with the export of the previous tile so that
1930 * the two transfers can proceed in parallel.
1931 *
1932 * The simplest form of duplex buffering takes the following flow.
1933 *
1934 * @startuml
1935 *
1936 * start
1937 *
1938 * :Create a TTL_tiler_t with TTL_create_tiler;
1939 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1940 * 1 input buffer, 1 output buffer;
1941 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1942 *
1943 * while (for each tile)
1944 *
1945 * :Import The Next Tile into the input buffer;
1946 *
1947 * :Process the Tile from the input buffer to the output buffer;
1948 *
1949 * :Export the processed tile from the output buffer;
1950 *
1951 * endwhile
1952 *
1953 * stop
1954 *
1955 * @enduml
1956 *
1957 * This can be optimized and standardized using the TTL_step_buffering
1958 * call.
1959 *
1960 * @startuml
1961 *
1962 * start
1963 *
1964 * :Create a TTL_tiler_t with TTL_create_tiler;
1965 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1966 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1967 *
1968 * while (for each tile)
1969 *
1970 * :Call TTL_step_buffering for the current tile
1971 *
1972 * This imports the new tile and exports the previous tile
1973 * in parallel;
1974 *
1975 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1976 * :Process the Tile from the input buffer to the output buffer;
1977 * endif
1978 *
1979 * endwhile
1980 *
1981 * stop
1982 *
1983 * @enduml
1984 */
1985 static inline TTL_duplex_const_long_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1986     TTL_ext_long_tensor_t ext_tensor_in, __local long *int_base_in, TTL_ext_long_tensor_t ext_tensor_out,
1987     __local long *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1988     TTL_duplex_const_long_tensor_buffering_t result;
1989     result.common.int_base[0] = int_base_in;
1990 result.common.int_base[1] = int_base_out;
1991
1992 result.common.ext_tensor_in = ext_tensor_in;
1993 result.common.ext_tensor_out = ext_tensor_out;
1994 result.events = events;
1997
1998 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1999
2000 return result;
2001}
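// Editor's sketch: one way the external tensors passed to TTL_start_duplex_buffering
// above might be built. The five TTL_create_ext_tensor arguments mirror the call made
// inside TTL_step_buffering below; TTL_create_shape, TTL_create_offset and the
// single-argument TTL_create_layout are assumed helpers from TTL_core.h, and the
// stride, base and size names are illustrative only.
//
//     const TTL_shape_t image_shape = TTL_create_shape(width, height);
//     const TTL_layout_t ext_layout = TTL_create_layout(row_stride_in_elements);
//     const TTL_ext_long_tensor_t ext_tensor_in = TTL_create_ext_tensor(
//         ext_base_in, image_shape, ext_layout, TTL_create_offset(0, 0), sizeof(long));
//     const TTL_ext_long_tensor_t ext_tensor_out = TTL_create_ext_tensor(
//         ext_base_out, image_shape, ext_layout, TTL_create_offset(0, 0), sizeof(long));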
2002
2003static inline TTL_io_long_tensor_t __attribute__((overloadable)) TTL_step_buffering(
2004 TTL_duplex_const_long_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
2005 TTL_tile_t tile_current_export) {
2006 const TTL_layout_t next_import_layout =
2007 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
2008     const TTL_const_ext_long_tensor_t next_import_ext_tensor =
2009         TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
2010                                     tile_current_import.shape,
2011 duplex_buffering->common.ext_tensor_in.layout,
2012 tile_current_import.offset,
2013 duplex_buffering->common.ext_tensor_in.elem_size);
2014 const TTL_int_long_sub_tensor_t next_import_int_sub_tensor =
2015 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
2016 tile_current_import.shape,
2017 next_import_layout,
2018 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2019 tile_current_import.offset);
2020
2021 const TTL_const_int_long_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
2022 const TTL_ext_long_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
2023
2024 if (TTL_tile_empty(tile_current_import) == false)
2025 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
2026 *TTL_to_void_tensor(&next_import_ext_tensor),
2027 &(*duplex_buffering->events)[0]);
2028
2029 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
2030 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
2031 *TTL_to_void_tensor(&next_export_ext_tensor),
2032 &(*duplex_buffering->events)[1]);
2033
2034 const TTL_layout_t int_export_layout =
2035 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
2036 const TTL_ext_long_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
2037 tile_current_export.shape,
2038 duplex_buffering->common.ext_tensor_out.layout,
2039 tile_current_export.offset,
2040 duplex_buffering->common.ext_tensor_out.elem_size);
2041 const TTL_int_long_sub_tensor_t to_export_from =
2042 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
2043 tile_current_export.shape,
2044 int_export_layout,
2045 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2046 tile_current_export.offset);
2047
2048 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
2049 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
2050
2051 TTL_wait(2, *duplex_buffering->events);
2052
2053 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
2054}
2055
2056static inline void __attribute__((overloadable)) TTL_finish_buffering(
2057 TTL_duplex_const_long_tensor_buffering_t *const duplex_buffering) {
2059}
2060/*
2061 * TTL_duplex_scheme.h
2062 *
2063 * Copyright (c) 2023 Mobileye
2064 *
2065 * Licensed under the Apache License, Version 2.0 (the License);
2066 * you may not use this file except in compliance with the License.
2067 * You may obtain a copy of the License at
2068 *
2069 * http://www.apache.org/licenses/LICENSE-2.0
2070 *
2071 * Unless required by applicable law or agreed to in writing, software
2072 * distributed under the License is distributed on an AS IS BASIS,
2073 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2074 * See the License for the specific language governing permissions and
2075 * limitations under the License.
2076 */
2077
2078// clang-format off
2079/**
2080 * @file
2081 *
2082 * Given a pair of blocking import and export operations that can execute concurrently,
2083 * TTL_duplex_buffering issues them together and then waits for both to complete,
2084 * ideally executing them in parallel to each other. This scheme uses two
2085 * internal buffers, one for the import and one for the export. Note that the
2086 * export is pipelined to pair the import of the current tile with the export of
2087 * the previous tile.
2088
2089 * The following table shows the pipelined actions performed in duplex buffering.
2090 * It specifies which tile is processed in each iteration:
2091 *
2092 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
2093 * |-------------------|-----|-----|----------------------|---------------|
2094 * | **Import** | 0 | 1 | i | |
2095 * | **Wait Import** | 0 | 1 | i | |
2096 * | **Compute** | 0 | 1 | i | |
2097 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
2098 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
2099 *
2100 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
2101 *
2102 * When including this file the following must be defined
2103 *
2104 * #define TTL_TENSOR_TYPE void
2105 * #define TTL_TENSOR_TYPE uchar
2106 * etc
2107 *
2108 * @example TTL_duplex_buffering.cl
2109 */
2110// clang-format on
2111
2112// This file presumes that the following have been pre included.
2113// this is not done here for path reasons.
2114// #include "TTL_core.h"
2115// #include "TTL_import_export.h"
2116// #include TTL_IMPORT_EXPORT_INCLUDE_H
2117
2118/**
2119 * @def The structs used for this buffering type
2120 */
2121
2122/**
2123 * @brief Data required to perform duplex buffer pipelining.
2124 *
2125 * @see TTL_start_duplex_buffering for a description of duplex buffer
2126 * pipelining.
2127 */
2128typedef struct {
2129 struct {
2130 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
2131 0->1->0->1... etc */
2132 __local ulong *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
2133 TTL_ext_ulong_tensor_t ext_tensor_in; /*!< The external tensor being input */
2134 TTL_ext_ulong_tensor_t ext_tensor_out; /*!< The external tensor being output */
2135 } common; ///< The information that is common to all pipeline schemes
2136
2137 TTL_event_t (*events)[2]; ///< Two events are required; the first is used for
2138 ///< external to internal transfers, the second for
2139 ///< internal to external transfers
2140
2141 /**
2142 * @brief Store of the buffers used for the previous import/export cycles.
2143 *
2144 */
2145     struct {
2146         TTL_const_int_ulong_tensor_t to_export_from; /*!< Internal tensor to export from on the next step */
2147         TTL_ext_ulong_tensor_t to_export_to; /*!< External tensor to export to on the next step */
2148     } prev_out_tensors;
2149 } TTL_duplex_const_ulong_tensor_buffering_t;
2150
2151/*
2152 * Predeclare TTL_step_buffering.
2153 */
2154static inline TTL_io_ulong_tensor_t __attribute__((overloadable)) TTL_step_buffering(
2155 TTL_duplex_const_ulong_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
2156 TTL_tile_t tile_current_export);
2157
2158/**
2159 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
2160 *
2161 * @param ext_tensor_in A tensor describing the input in global memory
2162 * @param int_base_in The address of the local import buffer.
2163 * @param ext_tensor_out A tensor describing the output in global memory
2164 * @param int_base_out The address of the local export buffer.
2165 * @param events A pointer to a list of 2 events.
2166 * The first event in the list will be used for imports, the second event in
2167 * the list will be used for exports.
2168 * @param first_tile The first tile to fetch for the scheme
2169 *
2170 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
2171 *
2172 * The first event in the list will be used for imports,
2173 * the second event in the list will be used for exports.
2174 * \n\n Example:
2175 * @code
2176 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
2177 *
2178 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
2179 * ext_tensor_in, l_buffers[0],
2180 * ext_tensor_out, l_buffers[1],
2181 * &events, first_tile);
2182 * @endcode
2183 * \n
2184 *
2185 * Duplex buffering uses one local buffer for the import and one for the export, and
2186 * pairs the import of the current tile with the export of the previous tile so that
2187 * the two transfers can proceed in parallel.
2188 *
2189 * The simplest form of duplex buffering takes the following flow.
2190 *
2191 * @startuml
2192 *
2193 * start
2194 *
2195 * :Create a TTL_tiler_t with TTL_create_tiler;
2196 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
2197 * 1 input buffer, 1 output buffer;
2198 * :NumberOfTiles = TTL_number_of_tiles(tiler);
2199 *
2200 * while (for each tile)
2201 *
2202 * :Import The Next Tile into the input buffer;
2203 *
2204 * :Process the Tile from the input buffer to the output buffer;
2205 *
2206 * :Export the processed tile from the output buffer;
2207 *
2208 * endwhile
2209 *
2210 * stop
2211 *
2212 * @enduml
2213 *
2214 * This can be optimized and standardized using the TTL_step_buffering
2215 * call.
2216 *
2217 * @startuml
2218 *
2219 * start
2220 *
2221 * :Create a TTL_tiler_t with TTL_create_tiler;
2222 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
2223 * :NumberOfTiles = TTL_number_of_tiles(tiler);
2224 *
2225 * while (for each tile)
2226 *
2227 * :Call TTL_step_buffering for the current tile
2228 *
2229 * This imports the new tile and exports the previous tile
2230 * in parallel;
2231 *
2232 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
2233 * :Process the Tile from the input buffer to the output buffer;
2234 * endif
2235 *
2236 * endwhile
2237 *
2238 * stop
2239 *
2240 * @enduml
2241 */
2242 static inline TTL_duplex_const_ulong_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
2243     TTL_ext_ulong_tensor_t ext_tensor_in, __local ulong *int_base_in, TTL_ext_ulong_tensor_t ext_tensor_out,
2244     __local ulong *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
2245     TTL_duplex_const_ulong_tensor_buffering_t result;
2246     result.common.int_base[0] = int_base_in;
2247 result.common.int_base[1] = int_base_out;
2248
2249 result.common.ext_tensor_in = ext_tensor_in;
2250 result.common.ext_tensor_out = ext_tensor_out;
2251 result.events = events;
2254
2255 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
2256
2257 return result;
2258}
2259
2260static inline TTL_io_ulong_tensor_t __attribute__((overloadable)) TTL_step_buffering(
2261 TTL_duplex_const_ulong_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
2262 TTL_tile_t tile_current_export) {
2263 const TTL_layout_t next_import_layout =
2264 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
2265     const TTL_const_ext_ulong_tensor_t next_import_ext_tensor =
2266         TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
2267                                     tile_current_import.shape,
2268 duplex_buffering->common.ext_tensor_in.layout,
2269 tile_current_import.offset,
2270 duplex_buffering->common.ext_tensor_in.elem_size);
2271 const TTL_int_ulong_sub_tensor_t next_import_int_sub_tensor =
2272 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
2273 tile_current_import.shape,
2274 next_import_layout,
2275 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2276 tile_current_import.offset);
2277
2278 const TTL_const_int_ulong_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
2279 const TTL_ext_ulong_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
2280
2281 if (TTL_tile_empty(tile_current_import) == false)
2282 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
2283 *TTL_to_void_tensor(&next_import_ext_tensor),
2284 &(*duplex_buffering->events)[0]);
2285
2286 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
2287 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
2288 *TTL_to_void_tensor(&next_export_ext_tensor),
2289 &(*duplex_buffering->events)[1]);
2290
2291 const TTL_layout_t int_export_layout =
2292 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
2293     const TTL_ext_ulong_tensor_t to_export_to =
2294         TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
2295                               tile_current_export.shape,
2296 duplex_buffering->common.ext_tensor_out.layout,
2297 tile_current_export.offset,
2298 duplex_buffering->common.ext_tensor_out.elem_size);
2299 const TTL_int_ulong_sub_tensor_t to_export_from =
2300 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
2301 tile_current_export.shape,
2302 int_export_layout,
2303 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2304 tile_current_export.offset);
2305
2306 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
2307 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
2308
2309 TTL_wait(2, *duplex_buffering->events);
2310
2311 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
2312}
2313
2314static inline void __attribute__((overloadable)) TTL_finish_buffering(
2315 TTL_duplex_const_ulong_tensor_buffering_t *const duplex_buffering) {
2317}