Tensor Tiling Library
 
Loading...
Searching...
No Matches
TTL/pipelines/TTL_duplex_scheme.h
Go to the documentation of this file.
1/*
2 * TTL_duplex_scheme.h
3 *
4 * Copyright (c) 2025 Mobileye
5 *
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19// clang-format off
20/**
21 * @file
22 *
23 * Given pair of blocking import and export that can execute concurrently,
24 * TTL_duplex_buffering issues them together and then waits on both to complete,
25 * hopefully executing them in parallel to each other. This scheme uses two
26 * internal buffers, one for the import and one for the export. Note that the
27 * export is pipelined to pair the import of the current tile with the export of
28 * previous tile.
29
30 * The following table draws the pipelined actions performed in duplex buffering.
31 * It specifies which tile is processed in each iteration:
32 *
33 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
34 * |-------------------|-----|-----|----------------------|---------------|
35 * | **Import** | 0 | 1 | i | |
36 * | **Wait Import** | 0 | 1 | i | |
37 * | **Compute** | 0 | 1 | i | |
38 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
39 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
40 *
41 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
42 *
43 * When including this file the following must be defined
44 *
45 * #define TTL_TENSOR_TYPE void
46 * #define TTL_TENSOR_TYPE uchar
47 * etc
48 *
49 * @example TTL_duplex_buffering.cl
50 */
51
53#include "TTL_schemes_common.h"
54
55/**
56 * @brief Data required to perform duplex buffer pipelining.
57 *
58 * @see TTL_start_duplex_buffering for a description of duplex buffer
59 * pipelining.
60 */
61template <typename TENSORTYPE>
63 /**
64 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
65 *
66 * @param ext_tensor_in A tensor describing the input in global memory
67 * @param int_base_in The address of the local import buffer.
68 * @param ext_tensor_out A tensor describing the output in global memory
69 * @param int_base_out The address of the local export buffer.
70 * @param m_events A pointer to a list of 2 m_events.
71 * The first event in the list will be used for imports, the second event in
72 * the list will be used for exports.
73 * @param first_tile The first tile to fetch for the scheme
74 *
75 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
76 *
77 * The first event in the list will be used for imports,
78 * the second event in the list will be used for exports.
79 * \n\n Example:
80 * @code
81 * TTL_event m_events[2] = { TTL_get_event(), TTL_get_event()};
82 *
83 * TTL_duplex_buffering buffering_scheme(
84 * ext_base_in, ext_layout_in, l_buffers[0],
85 * ext_base_out, ext_layout_out, l_buffers[1],
86 * &m_events);
87 * @endcode
88 * \n
89 *
90 * @return The TTL_duplex_buffering created from the input parameters.
91 *
92 * Solid description of duplex buffering here.
93 *
94 * The simplest form of duplex buffering takes the following flow.
95 *
96 * @startuml
97 *
98 * start
99 *
100 * :Create a TTL_tiler with TTL_create_tiler;
101 * :Create a TTL_duplex_buffering Structure with 2 Buffers
102 * 1 input buffer, 1 output buffer;
103 * :NumberOfTiles = TTL_number_of_tiles(tiler);
104 *
105 * while (for each tile)
106 *
107 * :Import The Next Tile into the input buffer;
108 *
109 * :Process the Tile from the input buffer to the output buffer;
110 *
111 * :Export The Processed Tile from the output buffer;
112 *
113 * endwhile
114 *
115 * stop
116 *
117 * @enduml
118 *
119 * This can be optimized and standardized using the step_buffering
120 * call.
121 *
122 * @startuml
123 *
124 * start
125 *
126 * :Create a TTL_tiler with TTL_create_tiler;
127 * :Create a TTL_duplex_buffering Structure with 2 Buffers 1 input buffer, 1 output buffer;
128 * :NumberOfTiles = TTL_number_of_tiles(tiler);
129 *
130 * while (for each tile)
131 *
132 * :Call step_buffering for the current tile
133 *
134 * This will import the current new tile and export the last tile
135 * in parallel;
136 *
137 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
138 * :Process the Tile from the input buffer to the output buffer;
139 * endif
140 *
141 * endwhile
142 *
143 * stop
144 *
145 * @enduml
146 */
147 TTL_duplex_buffering(TTL_tensor<TENSORTYPE> ext_tensor_in, TTL_local(TENSORTYPE *) int_base_in,
148 TTL_tensor<TENSORTYPE> ext_tensor_out, TTL_local(TENSORTYPE *) int_base_out,
149 TTL_event (*input_events)[2], TTL_tile first_tile) {
150 m_common.int_base [IMPORT_BUFFER] = int_base_in;
151 m_common.int_base [EXPORT_BUFFER] = int_base_out;
152
153 m_common.ext_tensor_in = ext_tensor_in;
154 m_common.ext_tensor_out = ext_tensor_out;
155 m_events = input_events;
158
159 step_buffering(first_tile, TTL_tile());
160 }
161
162 TTL_io_tensors<TENSORTYPE> step_buffering(TTL_tile tile_current_import, TTL_tile tile_current_export) {
163 const TTL_layout next_import_layout(tile_current_import.shape.width, tile_current_import.shape.height);
164 const TTL_tensor<TENSORTYPE> next_import_ext_tensor(m_common.ext_tensor_in.base,
165 tile_current_import.shape,
166 m_common.ext_tensor_in.layout,
167 tile_current_import.offset,
168 m_common.ext_tensor_in.elem_size);
169 const TTL_sub_tensor<TENSORTYPE> next_import_int_sub_tensor(m_common.int_base [IMPORT_BUFFER],
170 tile_current_import.shape,
171 next_import_layout,
172 m_common.ext_tensor_in,
173 tile_current_import.offset);
174
175 const TTL_tensor<TENSORTYPE> next_export_int_tensor = m_prev_out_tensors.to_export_from;
176 const TTL_tensor<TENSORTYPE> next_export_ext_tensor = m_prev_out_tensors.to_export_to;
177
178 if (tile_current_import.empty() == false)
179 TTL_import_sub_tensor(next_import_int_sub_tensor,
180 next_import_ext_tensor,
181 &(*m_events) [IMPORT_BUFFER]);
182
183 if (m_prev_out_tensors.to_export_from.empty() == false)
184 TTL_export(next_export_int_tensor,
185 next_export_ext_tensor,
186 &(*m_events) [EXPORT_BUFFER]);
187
188 const TTL_layout int_export_layout(tile_current_export.shape.width, tile_current_export.shape.height);
189 const TTL_tensor<TENSORTYPE> to_export_to(m_common.ext_tensor_out.base,
190 tile_current_export.shape,
191 m_common.ext_tensor_out.layout,
192 tile_current_export.offset,
193 m_common.ext_tensor_out.elem_size);
195 tile_current_export.shape,
196 int_export_layout,
197 m_common.ext_tensor_in,
198 tile_current_export.offset);
199
200 m_prev_out_tensors.to_export_to = to_export_to;
201 m_prev_out_tensors.to_export_from = to_export_from.tensor;
202
203 TTL_wait(2, *m_events);
204
205 return TTL_io_tensors(next_import_int_sub_tensor, to_export_from);
206 }
207
208 /**
209 * @brief Complete any transfers required to finish the buffering process.
210 *
211 * Any transfers that are still in progress will be completed and any transfers
212 * that need to be started and completed before finish_buffering returns
213 */
217
218 TTL_common_buffering<TENSORTYPE, 2> m_common; ///< The information that is m_common to all pipeline schemes
219
220 /**
221 * @brief Indexes to use for importing and exporting of data.
222 */
223 static constexpr unsigned int IMPORT_BUFFER = 0;
224 static constexpr unsigned int EXPORT_BUFFER = 1;
225
226 TTL_event (* m_events)[2]; ///< 2 m_events are required, 1 first is used for
227 ///< external to internal transfers, the second for
228 ///< internal to external transfers
229
230 /**
231 * @brief Store of the buffers used for the previous import/export cycles.
232 *
233 */
234 struct {
238};
static void TTL_wait(const int num_events, TTL_event_t *const events)
event_t TTL_event
TTL_event is a pseudonym for OpenCL event_t.
#define TTL_local(type)
Create a typed reference in the __local address space.
static void TTL_import_sub_tensor(const TTL_int_void_sub_tensor_t internal_sub_tensor, const TTL_const_ext_void_tensor_t const_external_tensor, TTL_event_t *event)
Implementation of TTL_import_sub_tensor.
static void TTL_export(const TTL_const_int_void_tensor_t internal_tensor, const TTL_ext_void_tensor_t external_tensor, TTL_event_t *event)
Export the internal tensor to the external tensor, returning when complete.
TTL_duplex_buffering(TTL_tensor< TENSORTYPE > ext_tensor_in, TTL_local(TENSORTYPE *) int_base_in, TTL_tensor< TENSORTYPE > ext_tensor_out, TTL_local(TENSORTYPE *) int_base_out, TTL_event(*input_events)[2], TTL_tile first_tile)
Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process.
TTL_io_tensors< TENSORTYPE > step_buffering(TTL_tile tile_current_import, TTL_tile tile_current_export)
TTL_tensor< TENSORTYPE > to_export_to
TTL_common_buffering< TENSORTYPE, 2 > m_common
The information that is m_common to all pipeline schemes.
static constexpr unsigned int IMPORT_BUFFER
Indexes to use for importing and exporting of data.
struct TTL_duplex_buffering::@107345071274067227120266144277122261200055226157 m_prev_out_tensors
Store of the buffers used for the previous import/export cycles.
static constexpr unsigned int EXPORT_BUFFER
TTL_tensor< TENSORTYPE > to_export_from
void finish_buffering()
Complete any transfers required to finish the buffering process.
Describes a pair of internal Tensors after an operation.
Description of a Tensor layout in memory.
TTL_dim height
Number of rows along dimension y.
TTL_dim width
Number of elements along dimension x.
A tensor plus its reference to its parent tensor.
A poor mans base class for an a tensor in the passed address space.
TTL_offset offset
TTL_shape shape
bool empty() const
Check if the tile passed is empty.