Tensor Tiling Library
 
TTL_duplex_scheme.h
1/*
2 * TTL_duplex_scheme.h
3 *
4 * Copyright (c) 2023 Mobileye
5 *
6 * Licensed under the Apache License, Version 2.0 (the License);
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
9 *
10 * http://www.apache.org/licenses/LICENSE-2.0
11 *
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an AS IS BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
17 */
18
19// clang-format off
20/**
21 * @file
22 *
23 * Given a pair of blocking import and export operations that can execute concurrently,
24 * TTL_duplex_buffering issues them together and then waits on both to complete,
25 * ideally executing them in parallel. This scheme uses two
26 * internal buffers, one for the import and one for the export. Note that the
27 * export is pipelined to pair the import of the current tile with the export of
28 * the previous tile.
29 *
30 * The following table shows the pipelined actions performed in duplex buffering.
31 * It specifies which tile is processed in each iteration:
32 *
33 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
34 * |-------------------|-----|-----|----------------------|---------------|
35 * | **Import** | 0 | 1 | i | |
36 * | **Wait Import** | 0 | 1 | i | |
37 * | **Compute** | 0 | 1 | i | |
38 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
39 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
40 *
41 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
42 *
43 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
44 *
45 * #define TTL_TENSOR_TYPE void
46 * #define TTL_TENSOR_TYPE uchar
47 * etc.
48 *
49 * @example TTL_duplex_buffering.cl
50 */
51// clang-format on
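
Before the listing continues, the sketch below shows one way a kernel might drive this scheme for uchar data, following the table above: one TTL_step_buffering call per tile plus an empty-tile pass for the epilog. The tensor/tiler constructor overloads, TTL_get_tile, the TILE_WIDTH/TILE_HEIGHT sizes and the compute() helper are illustrative assumptions rather than definitions made in this file; the shipped TTL_duplex_buffering.cl example remains the authoritative reference.

#define TILE_WIDTH 128
#define TILE_HEIGHT 32

void compute(TTL_io_uchar_tensor_t io); /* application-defined; sketched later in this file */

__kernel void duplex_sketch(__global uchar *restrict ext_base_in, __global uchar *restrict ext_base_out,
                            const int width, const int height) {
    __local uchar l_in[TILE_WIDTH * TILE_HEIGHT];   /* import buffer */
    __local uchar l_out[TILE_WIDTH * TILE_HEIGHT];  /* export buffer */

    /* Describe the external tensors and split the image into tiles (assumed constructor overloads). */
    const TTL_shape_t image_shape = TTL_create_shape(width, height);
    const TTL_layout_t ext_layout = TTL_create_layout(width);
    const TTL_ext_uchar_tensor_t ext_in = TTL_create_ext_tensor(ext_base_in, image_shape, ext_layout);
    const TTL_ext_uchar_tensor_t ext_out = TTL_create_ext_tensor(ext_base_out, image_shape, ext_layout);
    const TTL_tiler_t tiler = TTL_create_tiler(image_shape, TTL_create_shape(TILE_WIDTH, TILE_HEIGHT));

    TTL_event_t events[2] = { TTL_get_event(), TTL_get_event() };
    TTL_duplex_const_uchar_tensor_buffering_t scheme =
        TTL_start_duplex_buffering(ext_in, l_in, ext_out, l_out, &events, TTL_get_tile(0, tiler));

    /* One extra pass (the epilog in the table above) flushes the export of the final tile. */
    for (int tile_id = 0; tile_id <= TTL_number_of_tiles(tiler); tile_id++) {
        const TTL_tile_t tile = (tile_id < TTL_number_of_tiles(tiler)) ? TTL_get_tile(tile_id, tiler)
                                                                       : TTL_create_empty_tile();

        /* Imports this tile, exports the result of the previous tile, and waits on both. */
        const TTL_io_uchar_tensor_t io = TTL_step_buffering(&scheme, tile, tile);

        if (TTL_tile_empty(tile) == false)
            compute(io); /* reads io.imported_to, writes io.to_export_from (assumed member names) */
    }

    TTL_finish_buffering(&scheme);
}
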
52
53// This file presumes that the following have been pre-included;
54// this is not done here for path reasons.
55// #include "TTL_core.h"
56// #include "TTL_import_export.h"
57// #include TTL_IMPORT_EXPORT_INCLUDE_H
58
59/**
60 * @def The structs used for this buffering type
61 */
62
63/**
64 * @brief Data required to perform duplex buffer pipelining.
65 *
66 * @see TTL_start_duplex_buffering for a description of duplex buffer
67 * pipelining.
68 */
69typedef struct {
70 struct {
71 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
72 0->1->0->1... etc */
73 __local void *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
74 TTL_ext_void_tensor_t ext_tensor_in; /*!< The external tensor being input */
75 TTL_ext_void_tensor_t ext_tensor_out; /*!< The external tensor being output */
76 } common; ///< The information that is common to all pipeline schemes
77
78 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
79 ///< external-to-internal transfers, the second for
80 ///< internal-to-external transfers
81
82 /**
83 * @brief Store of the buffers used for the previous import/export cycles.
84 *
85 */
86 struct {
87 TTL_ext_void_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
88 TTL_const_int_void_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
89 } prev_out_tensors;
90} TTL_duplex_const_void_tensor_buffering_t;
91
92/*
93 * Predeclare TTL_step_buffering.
94 */
95static inline TTL_io_void_tensor_t __attribute__((overloadable)) TTL_step_buffering(
96 TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
97 TTL_tile_t tile_current_export);
98
99/**
100 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
101 *
102 * @param ext_tensor_in A tensor describing the input in global memory
103 * @param int_base_in The address of the local import buffer.
104 * @param ext_tensor_out A tensor describing the output in global memory
105 * @param int_base_out The address of the local export buffer.
106 * @param events A pointer to a list of 2 events.
107 * The first event in the list will be used for imports, the second event in
108 * the list will be used for exports.
109 * @param first_tile The first tile to fetch for the scheme
110 *
111 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
112 *
113 * The first event in the list will be used for imports,
114 * the second event in the list will be used for exports.
115 * \n\n Example:
116 * @code
117 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
118 *
119 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
120 * ext_tensor_in, l_buffers[0],
121 * ext_tensor_out, l_buffers[1],
122 * &events, first_tile);
123 * @endcode
124 * \n
125 *
126 * @return The TTL_duplex_buffering_t created from the input parameters.
127 *
128 * Duplex buffering is described in more detail below.
129 *
130 * The simplest form of duplex buffering takes the following flow.
131 *
132 * @startuml
133 *
134 * start
135 *
136 * :Create a TTL_tiler_t with TTL_create_tiler;
137 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
138 * 1 input buffer, 1 output buffer;
139 * :NumberOfTiles = TTL_number_of_tiles(tiler);
140 *
141 * while (for each tile)
142 *
143 * :Import The Next Tile into the input buffer;
144 *
145 * :Process the Tile from the input buffer to the output buffer;
146 *
147 * :Export the Processed Tile from the output buffer;
148 *
149 * endwhile
150 *
151 * stop
152 *
153 * @enduml
154 *
155 * This can be optimized and standardized using the TTL_step_buffering
156 * call.
157 *
158 * @startuml
159 *
160 * start
161 *
162 * :Create a TTL_tiler_t with TTL_create_tiler;
163 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
164 * :NumberOfTiles = TTL_number_of_tiles(tiler);
165 *
166 * while (for each tile)
167 *
168 * :Call TTL_step_buffering for the current tile
169 *
170 * This will import the current new tile and export the last tile
171 * in parallel;
172 *
173 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
174 * :Process the Tile from the input buffer to the output buffer;
175 * endif
176 *
177 * endwhile
178 *
179 * stop
180 *
181 * @enduml
182 */
183static inline TTL_duplex_const_void_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
184 TTL_ext_void_tensor_t ext_tensor_in, __local void *int_base_in, TTL_ext_void_tensor_t ext_tensor_out,
185 __local void *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
186 TTL_duplex_const_void_tensor_buffering_t result;
187 result.common.int_base[0] = int_base_in;
188 result.common.int_base[1] = int_base_out;
189
190 result.common.ext_tensor_in = ext_tensor_in;
191 result.common.ext_tensor_out = ext_tensor_out;
192 result.events = events;
195
196 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
197
198 return result;
199}
200
201static inline TTL_io_void_tensor_t __attribute__((overloadable)) TTL_step_buffering(
202 TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
203 TTL_tile_t tile_current_export) {
204 const TTL_layout_t next_import_layout =
205 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
206 const TTL_const_ext_void_tensor_t next_import_ext_tensor =
208 tile_current_import.shape,
209 duplex_buffering->common.ext_tensor_in.layout,
210 tile_current_import.offset,
211 duplex_buffering->common.ext_tensor_in.elem_size);
212 const TTL_int_void_sub_tensor_t next_import_int_sub_tensor =
213 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
214 tile_current_import.shape,
215 next_import_layout,
216 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
217 tile_current_import.offset);
218
219 const TTL_const_int_void_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
220 const TTL_ext_void_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
221
222 if (TTL_tile_empty(tile_current_import) == false)
223 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
224 *TTL_to_void_tensor(&next_import_ext_tensor),
225 &(*duplex_buffering->events)[0]);
226
227 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
228 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
229 *TTL_to_void_tensor(&next_export_ext_tensor),
230 &(*duplex_buffering->events)[1]);
231
232 const TTL_layout_t int_export_layout =
233 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
234 const TTL_ext_void_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
235 tile_current_export.shape,
236 duplex_buffering->common.ext_tensor_out.layout,
237 tile_current_export.offset,
238 duplex_buffering->common.ext_tensor_out.elem_size);
239 const TTL_int_void_sub_tensor_t to_export_from =
240 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
241 tile_current_export.shape,
242 int_export_layout,
243 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
244 tile_current_export.offset);
245
246 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
247 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
248
249 TTL_wait(2, *duplex_buffering->events);
250
251 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
252}
253
254static inline void __attribute__((overloadable)) TTL_finish_buffering(
255 TTL_duplex_const_void_tensor_buffering_t *const duplex_buffering) {
257}
258/*
259 * TTL_duplex_scheme.h
260 *
261 * Copyright (c) 2023 Mobileye
262 *
263 * Licensed under the Apache License, Version 2.0 (the License);
264 * you may not use this file except in compliance with the License.
265 * You may obtain a copy of the License at
266 *
267 * http://www.apache.org/licenses/LICENSE-2.0
268 *
269 * Unless required by applicable law or agreed to in writing, software
270 * distributed under the License is distributed on an AS IS BASIS,
271 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
272 * See the License for the specific language governing permissions and
273 * limitations under the License.
274 */
275
276// clang-format off
277/**
278 * @file
279 *
280 * Given a pair of blocking import and export operations that can execute concurrently,
281 * TTL_duplex_buffering issues them together and then waits on both to complete,
282 * ideally executing them in parallel. This scheme uses two
283 * internal buffers, one for the import and one for the export. Note that the
284 * export is pipelined to pair the import of the current tile with the export of
285 * the previous tile.
286 *
287 * The following table shows the pipelined actions performed in duplex buffering.
288 * It specifies which tile is processed in each iteration:
289 *
290 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
291 * |-------------------|-----|-----|----------------------|---------------|
292 * | **Import** | 0 | 1 | i | |
293 * | **Wait Import** | 0 | 1 | i | |
294 * | **Compute** | 0 | 1 | i | |
295 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
296 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
297 *
298 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
299 *
300 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
301 *
302 * #define TTL_TENSOR_TYPE void
303 * #define TTL_TENSOR_TYPE uchar
304 * etc.
305 *
306 * @example TTL_duplex_buffering.cl
307 */
308// clang-format on
309
310// This file presumes that the following have been pre-included;
311// this is not done here for path reasons.
312// #include "TTL_core.h"
313// #include "TTL_import_export.h"
314// #include TTL_IMPORT_EXPORT_INCLUDE_H
315
316/**
317 * @def The structs used for this buffering type
318 */
319
320/**
321 * @brief Data required to perform duplex buffer pipelining.
322 *
323 * @see TTL_start_duplex_buffering for a description of duplex buffer
324 * pipelining.
325 */
326typedef struct {
327 struct {
328 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
329 0->1->0->1... etc */
330 __local char *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
331 TTL_ext_char_tensor_t ext_tensor_in; /*!< The external tensor being input */
332 TTL_ext_char_tensor_t ext_tensor_out; /*!< The external tensor being output */
333 } common; ///< The information that is common to all pipeline schemes
334
335 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
336 ///< external-to-internal transfers, the second for
337 ///< internal-to-external transfers
338
339 /**
340 * @brief Store of the buffers used for the previous import/export cycles.
341 *
342 */
343 struct {
344 TTL_ext_char_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
345 TTL_const_int_char_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
346 } prev_out_tensors;
347} TTL_duplex_const_char_tensor_buffering_t;
348
349/*
350 * Predeclare TTL_step_buffering.
351 */
352static inline TTL_io_char_tensor_t __attribute__((overloadable)) TTL_step_buffering(
353 TTL_duplex_const_char_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
354 TTL_tile_t tile_current_export);
355
356/**
357 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
358 *
359 * @param ext_tensor_in A tensor describing the input in global memory
360 * @param int_base_in The address of the local import buffer.
361 * @param ext_tensor_out A tensor describing the output in global memory
362 * @param int_base_out The address of the local export buffer.
363 * @param events A pointer to a list of 2 events.
364 * The first event in the list will be used for imports, the second event in
365 * the list will be used for exports.
366 * @param first_tile The first tile to fetch for the scheme
367 *
368 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
369 *
370 * The first event in the list will be used for imports,
371 * the second event in the list will be used for exports.
372 * \n\n Example:
373 * @code
374 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
375 *
376 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
377 * ext_tensor_in, l_buffers[0],
378 * ext_tensor_out, l_buffers[1],
379 * &events, first_tile);
380 * @endcode
381 * \n
382 *
383 * @return The TTL_duplex_buffering_t created from the input parameters.
384 *
385 * Duplex buffering is described in more detail below.
386 *
387 * The simplest form of duplex buffering takes the following flow.
388 *
389 * @startuml
390 *
391 * start
392 *
393 * :Create a TTL_tiler_t with TTL_create_tiler;
394 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
395 * 1 input buffer, 1 output buffer;
396 * :NumberOfTiles = TTL_number_of_tiles(tiler);
397 *
398 * while (for each tile)
399 *
400 * :Import The Next Tile into the input buffer;
401 *
402 * :Process the Tile from the input buffer to the output buffer;
403 *
404 * :Export the Processed Tile from the output buffer;
405 *
406 * endwhile
407 *
408 * stop
409 *
410 * @enduml
411 *
412 * This can be optimized and standardized using the TTL_step_buffering
413 * call.
414 *
415 * @startuml
416 *
417 * start
418 *
419 * :Create a TTL_tiler_t with TTL_create_tiler;
420 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
421 * :NumberOfTiles = TTL_number_of_tiles(tiler);
422 *
423 * while (for each tile)
424 *
425 * :Call TTL_step_buffering for the current tile
426 *
427 * This will import the current new tile and export the last tile
428 * in parallel;
429 *
430 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
431 * :Process the Tile from the input buffer to the output buffer;
432 * endif
433 *
434 * endwhile
435 *
436 * stop
437 *
438 * @enduml
439 */
440static inline TTL_duplex_const_char_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
441 TTL_ext_char_tensor_t ext_tensor_in, __local char *int_base_in, TTL_ext_char_tensor_t ext_tensor_out,
442 __local char *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
443 TTL_duplex_const_char_tensor_buffering_t result;
444 result.common.int_base[0] = int_base_in;
445 result.common.int_base[1] = int_base_out;
446
447 result.common.ext_tensor_in = ext_tensor_in;
448 result.common.ext_tensor_out = ext_tensor_out;
449 result.events = events;
452
453 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
454
455 return result;
456}
457
458static inline TTL_io_char_tensor_t __attribute__((overloadable)) TTL_step_buffering(
459 TTL_duplex_const_char_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
460 TTL_tile_t tile_current_export) {
461 const TTL_layout_t next_import_layout =
462 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
463 const TTL_const_ext_char_tensor_t next_import_ext_tensor =
465 tile_current_import.shape,
466 duplex_buffering->common.ext_tensor_in.layout,
467 tile_current_import.offset,
468 duplex_buffering->common.ext_tensor_in.elem_size);
469 const TTL_int_char_sub_tensor_t next_import_int_sub_tensor =
470 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
471 tile_current_import.shape,
472 next_import_layout,
473 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
474 tile_current_import.offset);
475
476 const TTL_const_int_char_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
477 const TTL_ext_char_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
478
479 if (TTL_tile_empty(tile_current_import) == false)
480 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
481 *TTL_to_void_tensor(&next_import_ext_tensor),
482 &(*duplex_buffering->events)[0]);
483
484 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
485 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
486 *TTL_to_void_tensor(&next_export_ext_tensor),
487 &(*duplex_buffering->events)[1]);
488
489 const TTL_layout_t int_export_layout =
490 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
491 const TTL_ext_char_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
492 tile_current_export.shape,
493 duplex_buffering->common.ext_tensor_out.layout,
494 tile_current_export.offset,
495 duplex_buffering->common.ext_tensor_out.elem_size);
496 const TTL_int_char_sub_tensor_t to_export_from =
497 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
498 tile_current_export.shape,
499 int_export_layout,
500 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
501 tile_current_export.offset);
502
503 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
504 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
505
506 TTL_wait(2, *duplex_buffering->events);
507
508 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
509}
510
511static inline void __attribute__((overloadable)) TTL_finish_buffering(
512 TTL_duplex_const_char_tensor_buffering_t *const duplex_buffering) {
514}
515/*
516 * TTL_duplex_scheme.h
517 *
518 * Copyright (c) 2023 Mobileye
519 *
520 * Licensed under the Apache License, Version 2.0 (the License);
521 * you may not use this file except in compliance with the License.
522 * You may obtain a copy of the License at
523 *
524 * http://www.apache.org/licenses/LICENSE-2.0
525 *
526 * Unless required by applicable law or agreed to in writing, software
527 * distributed under the License is distributed on an AS IS BASIS,
528 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
529 * See the License for the specific language governing permissions and
530 * limitations under the License.
531 */
532
533// clang-format off
534/**
535 * @file
536 *
537 * Given a pair of blocking import and export operations that can execute concurrently,
538 * TTL_duplex_buffering issues them together and then waits on both to complete,
539 * ideally executing them in parallel. This scheme uses two
540 * internal buffers, one for the import and one for the export. Note that the
541 * export is pipelined to pair the import of the current tile with the export of
542 * the previous tile.
543 *
544 * The following table shows the pipelined actions performed in duplex buffering.
545 * It specifies which tile is processed in each iteration:
546 *
547 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
548 * |-------------------|-----|-----|----------------------|---------------|
549 * | **Import** | 0 | 1 | i | |
550 * | **Wait Import** | 0 | 1 | i | |
551 * | **Compute** | 0 | 1 | i | |
552 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
553 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
554 *
555 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
556 *
557 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
558 *
559 * #define TTL_TENSOR_TYPE void
560 * #define TTL_TENSOR_TYPE uchar
561 * etc.
562 *
563 * @example TTL_duplex_buffering.cl
564 */
565// clang-format on
566
567// This file presumes that the following have been pre-included;
568// this is not done here for path reasons.
569// #include "TTL_core.h"
570// #include "TTL_import_export.h"
571// #include TTL_IMPORT_EXPORT_INCLUDE_H
572
573/**
574 * @def The structs used for this buffering type
575 */
576
577/**
578 * @brief Data required to perform duplex buffer pipelining.
579 *
580 * @see TTL_start_duplex_buffering for a description of duplex buffer
581 * pipelining.
582 */
583typedef struct {
584 struct {
585 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
586 0->1->0->1... etc */
587 __local uchar *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
588 TTL_ext_uchar_tensor_t ext_tensor_in; /*!< The external tensor being input */
589 TTL_ext_uchar_tensor_t ext_tensor_out; /*!< The external tensor being output */
590 } common; ///< The information that is common to all pipeline schemes
591
592 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
593 ///< external-to-internal transfers, the second for
594 ///< internal-to-external transfers
595
596 /**
597 * @brief Store of the buffers used for the previous import/export cycles.
598 *
599 */
600 struct {
601 TTL_ext_uchar_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
602 TTL_const_int_uchar_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
603 } prev_out_tensors;
604} TTL_duplex_const_uchar_tensor_buffering_t;
605
606/*
607 * Predeclare TTL_step_buffering.
608 */
609static inline TTL_io_uchar_tensor_t __attribute__((overloadable)) TTL_step_buffering(
610 TTL_duplex_const_uchar_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
611 TTL_tile_t tile_current_export);
612
613/**
614 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
615 *
616 * @param ext_tensor_in A tensor describing the input in global memory
617 * @param int_base_in The address of the local import buffer.
618 * @param ext_tensor_out A tensor describing the output in global memory
619 * @param int_base_out The address of the local export buffer.
620 * @param events A pointer to a list of 2 events.
621 * The first event in the list will be used for imports, the second event in
622 * the list will be used for exports.
623 * @param first_tile The first tile to fetch for the scheme
624 *
625 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
626 *
627 * The first event in the list will be used for imports,
628 * the second event in the list will be used for exports.
629 * \n\n Example:
630 * @code
631 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
632 *
633 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
634 * ext_tensor_in, l_buffers[0],
635 * ext_tensor_out, l_buffers[1],
636 * &events, first_tile);
637 * @endcode
638 * \n
639 *
640 * @return The TTL_duplex_buffering_t created from the input parameters.
641 *
642 * Duplex buffering is described in more detail below.
643 *
644 * The simplest form of duplex buffering takes the following flow.
645 *
646 * @startuml
647 *
648 * start
649 *
650 * :Create a TTL_tiler_t with TTL_create_tiler;
651 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
652 * 1 input buffer, 1 output buffer;
653 * :NumberOfTiles = TTL_number_of_tiles(tiler);
654 *
655 * while (for each tile)
656 *
657 * :Import The Next Tile into the input buffer;
658 *
659 * :Process the Tile from the input buffer to the output buffer;
660 *
661 * :Export the Processed Tile from the output buffer;
662 *
663 * endwhile
664 *
665 * stop
666 *
667 * @enduml
668 *
669 * This can be optimized and standardized using the TTL_step_buffering
670 * call.
671 *
672 * @startuml
673 *
674 * start
675 *
676 * :Create a TTL_tiler_t with TTL_create_tiler;
677 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
678 * :NumberOfTiles = TTL_number_of_tiles(tiler);
679 *
680 * while (for each tile)
681 *
682 * :Call TTL_step_buffering for the current tile
683 *
684 * This will import the current new tile and export the last tile
685 * in parallel;
686 *
687 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
688 * :Process the Tile from the input buffer to the output buffer;
689 * endif
690 *
691 * endwhile
692 *
693 * stop
694 *
695 * @enduml
696 */
697static inline TTL_duplex_const_uchar_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
698 TTL_ext_uchar_tensor_t ext_tensor_in, __local uchar *int_base_in, TTL_ext_uchar_tensor_t ext_tensor_out,
699 __local uchar *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
700 TTL_duplex_const_uchar_tensor_buffering_t result;
701 result.common.int_base[0] = int_base_in;
702 result.common.int_base[1] = int_base_out;
703
704 result.common.ext_tensor_in = ext_tensor_in;
705 result.common.ext_tensor_out = ext_tensor_out;
706 result.events = events;
709
710 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
711
712 return result;
713}
714
715static inline TTL_io_uchar_tensor_t __attribute__((overloadable)) TTL_step_buffering(
716 TTL_duplex_const_uchar_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
717 TTL_tile_t tile_current_export) {
718 const TTL_layout_t next_import_layout =
719 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
720 const TTL_const_ext_uchar_tensor_t next_import_ext_tensor =
722 tile_current_import.shape,
723 duplex_buffering->common.ext_tensor_in.layout,
724 tile_current_import.offset,
725 duplex_buffering->common.ext_tensor_in.elem_size);
726 const TTL_int_uchar_sub_tensor_t next_import_int_sub_tensor =
727 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
728 tile_current_import.shape,
729 next_import_layout,
730 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
731 tile_current_import.offset);
732
733 const TTL_const_int_uchar_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
734 const TTL_ext_uchar_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
735
736 if (TTL_tile_empty(tile_current_import) == false)
737 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
738 *TTL_to_void_tensor(&next_import_ext_tensor),
739 &(*duplex_buffering->events)[0]);
740
741 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
742 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
743 *TTL_to_void_tensor(&next_export_ext_tensor),
744 &(*duplex_buffering->events)[1]);
745
746 const TTL_layout_t int_export_layout =
747 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
748 const TTL_ext_uchar_tensor_t to_export_to =
749 TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
750 tile_current_export.shape,
751 duplex_buffering->common.ext_tensor_out.layout,
752 tile_current_export.offset,
753 duplex_buffering->common.ext_tensor_out.elem_size);
754 const TTL_int_uchar_sub_tensor_t to_export_from =
755 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
756 tile_current_export.shape,
757 int_export_layout,
758 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
759 tile_current_export.offset);
760
761 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
762 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
763
764 TTL_wait(2, *duplex_buffering->events);
765
766 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
767}
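
For completeness, here is a hedged sketch of the compute() helper referenced in the kernel sketch near the top of this listing: it simply copies the imported tile into the export buffer. The imported_to and to_export_from members of TTL_io_uchar_tensor_t, and the base and row_spacing tensor fields used below, are assumptions borrowed from other TTL schemes, not definitions made in this file.

void compute(TTL_io_uchar_tensor_t io) {
    /* Tile just imported into the input buffer (int_base[0]). */
    const TTL_int_uchar_sub_tensor_t src = io.imported_to;      /* assumed member name */
    /* Region of the output buffer (int_base[1]) that the next step call will export. */
    const TTL_int_uchar_sub_tensor_t dst = io.to_export_from;   /* assumed member name */

    for (int y = 0; y < (int)src.tensor.shape.height; y++) {
        for (int x = 0; x < (int)src.tensor.shape.width; x++) {
            dst.tensor.base[y * dst.tensor.layout.row_spacing + x] =
                src.tensor.base[y * src.tensor.layout.row_spacing + x];
        }
    }
}
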
768
769static inline void __attribute__((overloadable)) TTL_finish_buffering(
770 TTL_duplex_const_uchar_tensor_buffering_t *const duplex_buffering) {
772}
773/*
774 * TTL_duplex_scheme.h
775 *
776 * Copyright (c) 2023 Mobileye
777 *
778 * Licensed under the Apache License, Version 2.0 (the License);
779 * you may not use this file except in compliance with the License.
780 * You may obtain a copy of the License at
781 *
782 * http://www.apache.org/licenses/LICENSE-2.0
783 *
784 * Unless required by applicable law or agreed to in writing, software
785 * distributed under the License is distributed on an AS IS BASIS,
786 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
787 * See the License for the specific language governing permissions and
788 * limitations under the License.
789 */
790
791// clang-format off
792/**
793 * @file
794 *
795 * Given a pair of blocking import and export operations that can execute concurrently,
796 * TTL_duplex_buffering issues them together and then waits on both to complete,
797 * ideally executing them in parallel. This scheme uses two
798 * internal buffers, one for the import and one for the export. Note that the
799 * export is pipelined to pair the import of the current tile with the export of
800 * the previous tile.
801 *
802 * The following table shows the pipelined actions performed in duplex buffering.
803 * It specifies which tile is processed in each iteration:
804 *
805 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
806 * |-------------------|-----|-----|----------------------|---------------|
807 * | **Import** | 0 | 1 | i | |
808 * | **Wait Import** | 0 | 1 | i | |
809 * | **Compute** | 0 | 1 | i | |
810 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
811 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
812 *
813 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
814 *
815 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
816 *
817 * #define TTL_TENSOR_TYPE void
818 * #define TTL_TENSOR_TYPE uchar
819 * etc.
820 *
821 * @example TTL_duplex_buffering.cl
822 */
823// clang-format on
824
825// This file presumes that the following have been pre-included;
826// this is not done here for path reasons.
827// #include "TTL_core.h"
828// #include "TTL_import_export.h"
829// #include TTL_IMPORT_EXPORT_INCLUDE_H
830
831/**
832 * @def The structs used for this buffering type
833 */
834
835/**
836 * @brief Data required to perform duplex buffer pipelining.
837 *
838 * @see TTL_start_duplex_buffering for a description of duplex buffer
839 * pipelining.
840 */
841typedef struct {
842 struct {
843 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
844 0->1->0->1... etc */
845 __local int *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
846 TTL_ext_int_tensor_t ext_tensor_in; /*!< The external tensor being input */
847 TTL_ext_int_tensor_t ext_tensor_out; /*!< The external tensor being output */
848 } common; ///< The information that is common to all pipeline schemes
849
850 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
851 ///< external-to-internal transfers, the second for
852 ///< internal-to-external transfers
853
854 /**
855 * @brief Store of the buffers used for the previous import/export cycles.
856 *
857 */
858 struct {
859 TTL_ext_int_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
860 TTL_const_int_int_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
861 } prev_out_tensors;
862} TTL_duplex_const_int_tensor_buffering_t;
863
864/*
865 * Predeclare TTL_step_buffering.
866 */
867static inline TTL_io_int_tensor_t __attribute__((overloadable)) TTL_step_buffering(
868 TTL_duplex_const_int_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
869 TTL_tile_t tile_current_export);
870
871/**
872 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
873 *
874 * @param ext_tensor_in A tensor describing the input in global memory
875 * @param int_base_in The address of the local import buffer.
876 * @param ext_tensor_out A tensor describing the output in global memory
877 * @param int_base_out The address of the local export buffer.
878 * @param events A pointer to a list of 2 events.
879 * The first event in the list will be used for imports, the second event in
880 * the list will be used for exports.
881 * @param first_tile The first tile to fetch for the scheme
882 *
883 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
884 *
885 * The first event in the list will be used for imports,
886 * the second event in the list will be used for exports.
887 * \n\n Example:
888 * @code
889 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
890 *
891 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
892 * ext_tensor_in, l_buffers[0],
893 * ext_tensor_out, l_buffers[1],
894 * &events, first_tile);
895 * @endcode
896 * \n
897 *
898 * @return The TTL_duplex_buffering_t created from the input parameters.
899 *
900 * Duplex buffering is described in more detail below.
901 *
902 * The simplest form of duplex buffering takes the following flow.
903 *
904 * @startuml
905 *
906 * start
907 *
908 * :Create a TTL_tiler_t with TTL_create_tiler;
909 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
910 * 1 input buffer, 1 output buffer;
911 * :NumberOfTiles = TTL_number_of_tiles(tiler);
912 *
913 * while (for each tile)
914 *
915 * :Import The Next Tile into the input buffer;
916 *
917 * :Process the Tile from the input buffer to the output buffer;
918 *
919 * :Export the Processed Tile from the output buffer;
920 *
921 * endwhile
922 *
923 * stop
924 *
925 * @enduml
926 *
927 * This can be optimized and standardized using the TTL_step_buffering
928 * call.
929 *
930 * @startuml
931 *
932 * start
933 *
934 * :Create a TTL_tiler_t with TTL_create_tiler;
935 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
936 * :NumberOfTiles = TTL_number_of_tiles(tiler);
937 *
938 * while (for each tile)
939 *
940 * :Call TTL_step_buffering for the current tile
941 *
942 * This will import the current new tile and export the last tile
943 * in parallel;
944 *
945 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
946 * :Process the Tile from the input buffer to the output buffer;
947 * endif
948 *
949 * endwhile
950 *
951 * stop
952 *
953 * @enduml
954 */
955static inline TTL_duplex_const_int_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
956 TTL_ext_int_tensor_t ext_tensor_in, __local int *int_base_in, TTL_ext_int_tensor_t ext_tensor_out,
957 __local int *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
958 TTL_duplex_const_int_tensor_buffering_t result;
959 result.common.int_base[0] = int_base_in;
960 result.common.int_base[1] = int_base_out;
961
962 result.common.ext_tensor_in = ext_tensor_in;
963 result.common.ext_tensor_out = ext_tensor_out;
964 result.events = events;
967
968 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
969
970 return result;
971}
972
973static inline TTL_io_int_tensor_t __attribute__((overloadable)) TTL_step_buffering(
974 TTL_duplex_const_int_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
975 TTL_tile_t tile_current_export) {
976 const TTL_layout_t next_import_layout =
977 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
978 const TTL_const_ext_int_tensor_t next_import_ext_tensor =
980 tile_current_import.shape,
981 duplex_buffering->common.ext_tensor_in.layout,
982 tile_current_import.offset,
983 duplex_buffering->common.ext_tensor_in.elem_size);
984 const TTL_int_int_sub_tensor_t next_import_int_sub_tensor =
985 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
986 tile_current_import.shape,
987 next_import_layout,
988 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
989 tile_current_import.offset);
990
991 const TTL_const_int_int_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
992 const TTL_ext_int_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
993
994 if (TTL_tile_empty(tile_current_import) == false)
995 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
996 *TTL_to_void_tensor(&next_import_ext_tensor),
997 &(*duplex_buffering->events)[0]);
998
999 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1000 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1001 *TTL_to_void_tensor(&next_export_ext_tensor),
1002 &(*duplex_buffering->events)[1]);
1003
1004 const TTL_layout_t int_export_layout =
1005 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1006 const TTL_ext_int_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1007 tile_current_export.shape,
1008 duplex_buffering->common.ext_tensor_out.layout,
1009 tile_current_export.offset,
1010 duplex_buffering->common.ext_tensor_out.elem_size);
1011 const TTL_int_int_sub_tensor_t to_export_from =
1012 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1013 tile_current_export.shape,
1014 int_export_layout,
1015 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1016 tile_current_export.offset);
1017
1018 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1019 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1020
1021 TTL_wait(2, *duplex_buffering->events);
1022
1023 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1024}
1025
1026static inline void __attribute__((overloadable)) TTL_finish_buffering(
1027 TTL_duplex_const_int_tensor_buffering_t *const duplex_buffering) {
1029}
1030/*
1031 * TTL_duplex_scheme.h
1032 *
1033 * Copyright (c) 2023 Mobileye
1034 *
1035 * Licensed under the Apache License, Version 2.0 (the License);
1036 * you may not use this file except in compliance with the License.
1037 * You may obtain a copy of the License at
1038 *
1039 * http://www.apache.org/licenses/LICENSE-2.0
1040 *
1041 * Unless required by applicable law or agreed to in writing, software
1042 * distributed under the License is distributed on an AS IS BASIS,
1043 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1044 * See the License for the specific language governing permissions and
1045 * limitations under the License.
1046 */
1047
1048// clang-format off
1049/**
1050 * @file
1051 *
1052 * Given a pair of blocking import and export operations that can execute concurrently,
1053 * TTL_duplex_buffering issues them together and then waits on both to complete,
1054 * ideally executing them in parallel. This scheme uses two
1055 * internal buffers, one for the import and one for the export. Note that the
1056 * export is pipelined to pair the import of the current tile with the export of
1057 * the previous tile.
1058 *
1059 * The following table shows the pipelined actions performed in duplex buffering.
1060 * It specifies which tile is processed in each iteration:
1061 *
1062 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1063 * |-------------------|-----|-----|----------------------|---------------|
1064 * | **Import** | 0 | 1 | i | |
1065 * | **Wait Import** | 0 | 1 | i | |
1066 * | **Compute** | 0 | 1 | i | |
1067 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1068 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1069 *
1070 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
1071 *
1072 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
1073 *
1074 * #define TTL_TENSOR_TYPE void
1075 * #define TTL_TENSOR_TYPE uchar
1076 * etc.
1077 *
1078 * @example TTL_duplex_buffering.cl
1079 */
1080// clang-format on
1081
1083// This file presumes that the following have been pre-included;
1084// this is not done here for path reasons.
1084// #include "TTL_core.h"
1085// #include "TTL_import_export.h"
1086// #include TTL_IMPORT_EXPORT_INCLUDE_H
1087
1088/**
1089 * @def The structs used for this buffering type
1090 */
1091
1092/**
1093 * @brief Data required to perform duplex buffer pipelining.
1094 *
1095 * @see TTL_start_duplex_buffering for a description of duplex buffer
1096 * pipelining.
1097 */
1098typedef struct {
1099 struct {
1100 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1101 0->1->0->1... etc */
1102 __local uint *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1103 TTL_ext_uint_tensor_t ext_tensor_in; /*!< The external tensor being input */
1104 TTL_ext_uint_tensor_t ext_tensor_out; /*!< The external tensor being output */
1105 } common; ///< The information that is common to all pipeline schemes
1106
1107 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
1108 ///< external-to-internal transfers, the second for
1109 ///< internal-to-external transfers
1110
1111 /**
1112 * @brief Store of the buffers used for the previous import/export cycles.
1113 *
1114 */
1115 struct {
1116 TTL_ext_uint_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
1117 TTL_const_int_uint_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
1118 } prev_out_tensors;
1119} TTL_duplex_const_uint_tensor_buffering_t;
1120
1121/*
1122 * Predeclare TTL_step_buffering.
1123 */
1124static inline TTL_io_uint_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1125 TTL_duplex_const_uint_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1126 TTL_tile_t tile_current_export);
1127
1128/**
1129 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1130 *
1131 * @param ext_tensor_in A tensor describing the input in global memory
1132 * @param int_base_in The address of the local import buffer.
1133 * @param ext_tensor_out A tensor describing the output in global memory
1134 * @param int_base_out The address of the local export buffer.
1135 * @param events A pointer to a list of 2 events.
1136 * The first event in the list will be used for imports, the second event in
1137 * the list will be used for exports.
1138 * @param first_tile The first tile to fetch for the scheme
1139 *
1140 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1141 *
1142 * The first event in the list will be used for imports,
1143 * the second event in the list will be used for exports.
1144 * \n\n Example:
1145 * @code
1146 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1147 *
1148 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1149 * ext_tensor_in, l_buffers[0],
1150 * ext_tensor_out, l_buffers[1],
1151 * &events, first_tile);
1152 * @endcode
1153 * \n
1154 *
1155 * @return The TTL_duplex_buffering_t created from the input parameters.
1156 *
1157 * Duplex buffering is described in more detail below.
1158 *
1159 * The simplest form of duplex buffering takes the following flow.
1160 *
1161 * @startuml
1162 *
1163 * start
1164 *
1165 * :Create a TTL_tiler_t with TTL_create_tiler;
1166 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1167 * 1 input buffer, 1 output buffer;
1168 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1169 *
1170 * while (for each tile)
1171 *
1172 * :Import The Next Tile into the input buffer;
1173 *
1174 * :Process the Tile from the input buffer to the output buffer;
1175 *
1176 * :Export the Processed Tile from the output buffer;
1177 *
1178 * endwhile
1179 *
1180 * stop
1181 *
1182 * @enduml
1183 *
1184 * This can be optimized and standardized using the TTL_step_buffering
1185 * call.
1186 *
1187 * @startuml
1188 *
1189 * start
1190 *
1191 * :Create a TTL_tiler_t with TTL_create_tiler;
1192 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1193 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1194 *
1195 * while (for each tile)
1196 *
1197 * :Call TTL_step_buffering for the current tile
1198 *
1199 * This will import the current new tile and export the last tile
1200 * in parallel;
1201 *
1202 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1203 * :Process the Tile from the input buffer to the output buffer;
1204 * endif
1205 *
1206 * endwhile
1207 *
1208 * stop
1209 *
1210 * @enduml
1211 */
1212static inline TTL_duplex_const_uint_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1213 TTL_ext_uint_tensor_t ext_tensor_in, __local uint *int_base_in, TTL_ext_uint_tensor_t ext_tensor_out,
1214 __local uint *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1215 TTL_duplex_const_uint_tensor_buffering_t result;
1216 result.common.int_base[0] = int_base_in;
1217 result.common.int_base[1] = int_base_out;
1218
1219 result.common.ext_tensor_in = ext_tensor_in;
1220 result.common.ext_tensor_out = ext_tensor_out;
1221 result.events = events;
1224
1225 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1226
1227 return result;
1228}
1229
1230static inline TTL_io_uint_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1231 TTL_duplex_const_uint_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
1232 TTL_tile_t tile_current_export) {
1233 const TTL_layout_t next_import_layout =
1234 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
1235 const TTL_const_ext_uint_tensor_t next_import_ext_tensor =
1237 tile_current_import.shape,
1238 duplex_buffering->common.ext_tensor_in.layout,
1239 tile_current_import.offset,
1240 duplex_buffering->common.ext_tensor_in.elem_size);
1241 const TTL_int_uint_sub_tensor_t next_import_int_sub_tensor =
1242 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
1243 tile_current_import.shape,
1244 next_import_layout,
1245 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1246 tile_current_import.offset);
1247
1248 const TTL_const_int_uint_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
1249 const TTL_ext_uint_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
1250
1251 if (TTL_tile_empty(tile_current_import) == false)
1252 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1253 *TTL_to_void_tensor(&next_import_ext_tensor),
1254 &(*duplex_buffering->events)[0]);
1255
1256 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1257 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1258 *TTL_to_void_tensor(&next_export_ext_tensor),
1259 &(*duplex_buffering->events)[1]);
1260
1261 const TTL_layout_t int_export_layout =
1262 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1263 const TTL_ext_uint_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1264 tile_current_export.shape,
1265 duplex_buffering->common.ext_tensor_out.layout,
1266 tile_current_export.offset,
1267 duplex_buffering->common.ext_tensor_out.elem_size);
1268 const TTL_int_uint_sub_tensor_t to_export_from =
1269 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1270 tile_current_export.shape,
1271 int_export_layout,
1272 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1273 tile_current_export.offset);
1274
1275 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1276 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1277
1278 TTL_wait(2, *duplex_buffering->events);
1279
1280 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1281}
1282
1283static inline void __attribute__((overloadable)) TTL_finish_buffering(
1284 TTL_duplex_const_uint_tensor_buffering_t *const duplex_buffering) {
1286}
1287/*
1288 * TTL_duplex_scheme.h
1289 *
1290 * Copyright (c) 2023 Mobileye
1291 *
1292 * Licensed under the Apache License, Version 2.0 (the License);
1293 * you may not use this file except in compliance with the License.
1294 * You may obtain a copy of the License at
1295 *
1296 * http://www.apache.org/licenses/LICENSE-2.0
1297 *
1298 * Unless required by applicable law or agreed to in writing, software
1299 * distributed under the License is distributed on an AS IS BASIS,
1300 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1301 * See the License for the specific language governing permissions and
1302 * limitations under the License.
1303 */
1304
1305// clang-format off
1306/**
1307 * @file
1308 *
1309 * Given a pair of blocking import and export operations that can execute concurrently,
1310 * TTL_duplex_buffering issues them together and then waits on both to complete,
1311 * ideally executing them in parallel. This scheme uses two
1312 * internal buffers, one for the import and one for the export. Note that the
1313 * export is pipelined to pair the import of the current tile with the export of
1314 * the previous tile.
1315 *
1316 * The following table shows the pipelined actions performed in duplex buffering.
1317 * It specifies which tile is processed in each iteration:
1318 *
1319 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1320 * |-------------------|-----|-----|----------------------|---------------|
1321 * | **Import** | 0 | 1 | i | |
1322 * | **Wait Import** | 0 | 1 | i | |
1323 * | **Compute** | 0 | 1 | i | |
1324 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1325 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1326 *
1327 * Notice the epilog (\#NumOfTiles), which is an extra iteration.
1328 *
1329 * When including this file, TTL_TENSOR_TYPE must be defined, for example:
1330 *
1331 * #define TTL_TENSOR_TYPE void
1332 * #define TTL_TENSOR_TYPE uchar
1333 * etc.
1334 *
1335 * @example TTL_duplex_buffering.cl
1336 */
1337// clang-format on
1338
1340// This file presumes that the following have been pre-included;
1341// this is not done here for path reasons.
1341// #include "TTL_core.h"
1342// #include "TTL_import_export.h"
1343// #include TTL_IMPORT_EXPORT_INCLUDE_H
1344
1345/**
1346 * @def The structs used for this buffering type
1347 */
1348
1349/**
1350 * @brief Data required to perform duplex buffer pipelining.
1351 *
1352 * @see TTL_start_duplex_buffering for a description of duplex buffer
1353 * pipelining.
1354 */
1355typedef struct {
1356 struct {
1357 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1358 0->1->0->1... etc */
1359 __local short *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1360 TTL_ext_short_tensor_t ext_tensor_in; /*!< The external tensor being input */
1361 TTL_ext_short_tensor_t ext_tensor_out; /*!< The external tensor being output */
1362 } common; ///< The information that is common to all pipeline schemes
1363
1364 TTL_event_t (*events)[2]; ///< 2 events are required: the first is used for
1365 ///< external-to-internal transfers, the second for
1366 ///< internal-to-external transfers
1367
1368 /**
1369 * @brief Store of the buffers used for the previous import/export cycles.
1370 *
1371 */
1372 struct {
1373 TTL_ext_short_tensor_t to_export_to; /*!< The external tensor to export to on the next step */
1374 TTL_const_int_short_tensor_t to_export_from; /*!< The internal tensor to export from on the next step */
1375 } prev_out_tensors;
1376} TTL_duplex_const_short_tensor_buffering_t;
1377
1378/*
1379 * Predeclare TTL_step_buffering.
1380 */
1381static inline TTL_io_short_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1382 TTL_duplex_const_short_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1383 TTL_tile_t tile_current_export);
1384
1385/**
1386 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1387 *
1388 * @param ext_tensor_in A tensor describing the input in global memory
1389 * @param int_base_in The address of the local import buffer.
1390 * @param ext_tensor_out A tensor describing the output in global memory
1391 * @param int_base_out The address of the local export buffer.
1392 * @param events A pointer to a list of 2 events.
1393 * The first event in the list will be used for imports, the second event in
1394 * the list will be used for exports.
1395 * @param first_tile The first tile to fetch for the scheme
1396 *
1397 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1398 *
1399 * The first event in the list will be used for imports,
1400 * the second event in the list will be used for exports.
1401 * \n\n Example:
1402 * @code
1403 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1404 *
1405 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1406 * ext_tensor_in, l_buffers[0],
1407 * ext_tensor_out, l_buffers[1],
1408 * &events, first_tile);
1409 * @endcode
1410 * \n
1411 *
1412 * @return The TTL_duplex_buffering_t created from the input parameters.
1413 *
1414 * Duplex buffering is described in more detail below.
1415 *
1416 * The simplest form of duplex buffering takes the following flow.
1417 *
1418 * @startuml
1419 *
1420 * start
1421 *
1422 * :Create a TTL_tiler_t with TTL_create_tiler;
1423 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1424 * 1 input buffer, 1 output buffer;
1425 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1426 *
1427 * while (for each tile)
1428 *
1429 * :Import The Next Tile into the input buffer;
1430 *
1431 * :Process the Tile from the input buffer to the output buffer;
1432 *
1433 * :Export the Processed Tile from the output buffer;
1434 *
1435 * endwhile
1436 *
1437 * stop
1438 *
1439 * @enduml
1440 *
1441 * This can be optimized and standardized using the TTL_step_buffering
1442 * call.
1443 *
1444 * @startuml
1445 *
1446 * start
1447 *
1448 * :Create a TTL_tiler_t with TTL_create_tiler;
1449 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1450 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1451 *
1452 * while (for each tile)
1453 *
1454 * :Call TTL_step_buffering for the current tile
1455 *
1456 * This will import the current new tile and export the last tile
1457 * in parallel;
1458 *
1459 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1460 * :Process the Tile from the input buffer to the output buffer;
1461 * endif
1462 *
1463 * endwhile
1464 *
1465 * stop
1466 *
1467 * @enduml
1468 */
1469static inline TTL_duplex_const_short_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1470 TTL_ext_short_tensor_t ext_tensor_in, __local short *int_base_in, TTL_ext_short_tensor_t ext_tensor_out,
1471 __local short *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1472 TTL_duplex_const_short_tensor_buffering_t result;
1473 result.common.int_base[0] = int_base_in;
1474 result.common.int_base[1] = int_base_out;
1475
1476 result.common.ext_tensor_in = ext_tensor_in;
1477 result.common.ext_tensor_out = ext_tensor_out;
1478 result.events = events;
1481
1482 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1483
1484 return result;
1485}
1486
1487static inline TTL_io_short_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1488 TTL_duplex_const_short_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
1489 TTL_tile_t tile_current_export) {
1490 const TTL_layout_t next_import_layout =
1491 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
1492 const TTL_const_ext_short_tensor_t next_import_ext_tensor =
1494 tile_current_import.shape,
1495 duplex_buffering->common.ext_tensor_in.layout,
1496 tile_current_import.offset,
1497 duplex_buffering->common.ext_tensor_in.elem_size);
1498 const TTL_int_short_sub_tensor_t next_import_int_sub_tensor =
1499 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
1500 tile_current_import.shape,
1501 next_import_layout,
1502 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1503 tile_current_import.offset);
1504
1505 const TTL_const_int_short_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
1506 const TTL_ext_short_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
1507
1508 if (TTL_tile_empty(tile_current_import) == false)
1509 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1510 *TTL_to_void_tensor(&next_import_ext_tensor),
1511 &(*duplex_buffering->events)[0]);
1512
1513 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1514 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1515 *TTL_to_void_tensor(&next_export_ext_tensor),
1516 &(*duplex_buffering->events)[1]);
1517
1518 const TTL_layout_t int_export_layout =
1519 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1520 const TTL_ext_short_tensor_t to_export_to =
1521 TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1522 tile_current_export.shape,
1523 duplex_buffering->common.ext_tensor_out.layout,
1524 tile_current_export.offset,
1525 duplex_buffering->common.ext_tensor_out.elem_size);
1526 const TTL_int_short_sub_tensor_t to_export_from =
1527 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1528 tile_current_export.shape,
1529 int_export_layout,
1530 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1531 tile_current_export.offset);
1532
1533 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1534 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1535
1536 TTL_wait(2, *duplex_buffering->events);
1537
1538 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1539}
1540
1541static inline void __attribute__((overloadable)) TTL_finish_buffering(
1542 TTL_duplex_const_short_tensor_buffering_t *const duplex_buffering) {
1544}
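// Editor's sketch (not part of the original header): one way a kernel can drive the
// short-tensor duplex scheme defined above. The tiler, TTL_get_tile(i, tiler), the two
// __local buffers, the external tensors and compute() are assumed to be provided by the
// caller; the imported_to / to_export_from field names follow the TTL_create_io_tensors
// call above.
//
//     TTL_event_t events[2] = { TTL_get_event(), TTL_get_event() };
//     TTL_duplex_const_short_tensor_buffering_t scheme = TTL_start_duplex_buffering(
//         ext_tensor_in, l_buffer_in, ext_tensor_out, l_buffer_out, &events,
//         TTL_get_tile(0, tiler));
//
//     for (int i = 0; i < TTL_number_of_tiles(tiler); ++i) {
//         const TTL_tile_t tile = TTL_get_tile(i, tiler);
//         // Import tile i and export tile i-1 in parallel, then wait for both.
//         const TTL_io_short_tensor_t io = TTL_step_buffering(&scheme, tile, tile);
//         compute(io.imported_to, io.to_export_from);  // user-supplied per-tile processing
//     }
//
//     // Epilog (iteration #NumOfTiles in the table above): drain the final export.
//     TTL_step_buffering(&scheme, TTL_create_empty_tile(), TTL_create_empty_tile());
//     TTL_finish_buffering(&scheme);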
1545/*
1546 * TTL_duplex_scheme.h
1547 *
1548 * Copyright (c) 2023 Mobileye
1549 *
1550 * Licensed under the Apache License, Version 2.0 (the License);
1551 * you may not use this file except in compliance with the License.
1552 * You may obtain a copy of the License at
1553 *
1554 * http://www.apache.org/licenses/LICENSE-2.0
1555 *
1556 * Unless required by applicable law or agreed to in writing, software
1557 * distributed under the License is distributed on an AS IS BASIS,
1558 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1559 * See the License for the specific language governing permissions and
1560 * limitations under the License.
1561 */
1562
1563// clang-format off
1564/**
1565 * @file
1566 *
1567 * Given a pair of blocking import and export operations that can execute concurrently,
1568 * TTL_duplex_buffering issues them together and then waits for both to complete,
1569 * ideally executing them in parallel to each other. This scheme uses two
1570 * internal buffers, one for the import and one for the export. Note that the
1571 * export is pipelined to pair the import of the current tile with the export of
1572 * the previous tile.
1573
1574 * The following table shows the pipelined actions performed in duplex buffering.
1575 * It specifies which tile is processed in each iteration:
1576 *
1577 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1578 * |-------------------|-----|-----|----------------------|---------------|
1579 * | **Import** | 0 | 1 | i | |
1580 * | **Wait Import** | 0 | 1 | i | |
1581 * | **Compute** | 0 | 1 | i | |
1582 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1583 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1584 *
1585 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
1586 *
1587 * When including this file the following must be defined
1588 *
1589 * #define TTL_TENSOR_TYPE void
1590 * #define TTL_TENSOR_TYPE uchar
1591 * etc
1592 *
1593 * @example TTL_duplex_buffering.cl
1594 */
1595// clang-format on
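// Editor's illustration: the pipeline table above worked through for NumOfTiles = 3.
//
//   iteration 0: import tile 0, wait import, compute tile 0
//   iteration 1: import tile 1, wait import, compute tile 1, export tile 0, wait export
//   iteration 2: import tile 2, wait import, compute tile 2, export tile 1, wait export
//   iteration 3: (epilog) export tile 2, wait export
//
// i.e. NumOfTiles + 1 steps in total: one per tile plus the extra epilog step that
// drains the last export.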
1596
1597// This file presumes that the following have been pre included.
1598// this is not done here for path reasons.
1599// #include "TTL_core.h"
1600// #include "TTL_import_export.h"
1601// #include TTL_IMPORT_EXPORT_INCLUDE_H
1602
1603/**
1604 * @def The structs used for this buffering type
1605 */
1606
1607/**
1608 * @brief Data required to perform duplex buffer pipelining.
1609 *
1610 * @see TTL_start_duplex_buffering for a description of duplex buffer
1611 * pipelining.
1612 */
1613typedef struct {
1614 struct {
1615 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1616 0->1->0->1... etc */
1617 __local ushort *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1618 TTL_ext_ushort_tensor_t ext_tensor_in; /*!< The external tensor being input */
1619 TTL_ext_ushort_tensor_t ext_tensor_out; /*!< The external tensor being output */
1620 } common; ///< The information that is common to all pipeline schemes
1621
1622 TTL_event_t (*events)[2]; ///< Two events are required; the first is used for
1623 ///< external to internal transfers, the second for
1624 ///< internal to external transfers
1625
1626 /**
1627 * @brief Store of the buffers used for the previous import/export cycles.
1628 *
1629 */
1630     struct {
1631         TTL_const_int_ushort_tensor_t to_export_from; /*!< Internal tensor to export from on the next step */
1632         TTL_ext_ushort_tensor_t to_export_to; /*!< External tensor to export to on the next step */
1633     } prev_out_tensors;
1634 } TTL_duplex_const_ushort_tensor_buffering_t;
1635
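// Editor's note: the typedef above stores only the pointer passed in as `events`
// (TTL_event_t (*)[2]); it does not copy the two events. The caller therefore keeps
// ownership of a two-element array that must stay alive for as long as the scheme is
// stepped. The caller-side names below are illustrative only:
//
//     TTL_event_t events[2] = { TTL_get_event(), TTL_get_event() };  // outlives the scheme
//     ...
//     TTL_start_duplex_buffering(ext_tensor_in, l_in, ext_tensor_out, l_out, &events, first_tile);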
1636/*
1637 * Predeclare TTL_step_buffering.
1638 */
1639static inline TTL_io_ushort_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1640 TTL_duplex_const_ushort_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1641 TTL_tile_t tile_current_export);
1642
1643/**
1644 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1645 *
1646 * @param ext_tensor_in A tensor describing the input in global memory
1647 * @param int_base_in The address of the local import buffer.
1648 * @param ext_tensor_out A tensor describing the output in global memory
1649 * @param int_base_out The address of the local export buffer.
1650 * @param events A pointer to a list of 2 events.
1651 * The first event in the list will be used for imports, the second event in
1652 * the list will be used for exports.
1653 * @param first_tile The first tile to fetch for the scheme
1654 *
1655 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1656 *
1657 * The first event in the list will be used for imports,
1658 * the second event in the list will be used for exports.
1659 * \n\n Example:
1660 * @code
1661 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1662 *
1663 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1664 * ext_tensor_in, l_buffers[0],
1665 * ext_tensor_out, l_buffers[1],
1666 * &events, first_tile);
1667 * @endcode
1668 * \n
1669 *
1670 * Duplex buffering uses one local buffer for the import and one for the export, and
1671 * pairs the import of the current tile with the export of the previous tile so that
1672 * the two transfers can proceed in parallel.
1673 *
1674 * The simplest form of duplex buffering takes the following flow.
1675 *
1676 * @startuml
1677 *
1678 * start
1679 *
1680 * :Create a TTL_tiler_t with TTL_create_tiler;
1681 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1682 * 1 input buffer, 1 output buffer;
1683 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1684 *
1685 * while (for each tile)
1686 *
1687 * :Import The Next Tile into the input buffer;
1688 *
1689 * :Process the Tile from the input buffer to the output buffer;
1690 *
1691 * :Export the processed tile from the output buffer;
1692 *
1693 * endwhile
1694 *
1695 * stop
1696 *
1697 * @enduml
1698 *
1699 * This can be optimized and standardized using the TTL_step_buffering
1700 * call.
1701 *
1702 * @startuml
1703 *
1704 * start
1705 *
1706 * :Create a TTL_tiler_t with TTL_create_tiler;
1707 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1708 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1709 *
1710 * while (for each tile)
1711 *
1712 * :Call TTL_step_buffering for the current tile
1713 *
1714 * This imports the new tile and exports the previous tile
1715 * in parallel;
1716 *
1717 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1718 * :Process the Tile from the input buffer to the output buffer;
1719 * endif
1720 *
1721 * endwhile
1722 *
1723 * stop
1724 *
1725 * @enduml
1726 */
1727 static inline TTL_duplex_const_ushort_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1728     TTL_ext_ushort_tensor_t ext_tensor_in, __local ushort *int_base_in, TTL_ext_ushort_tensor_t ext_tensor_out,
1729     __local ushort *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1730     TTL_duplex_const_ushort_tensor_buffering_t result;
1731     result.common.int_base[0] = int_base_in;
1732 result.common.int_base[1] = int_base_out;
1733
1734 result.common.ext_tensor_in = ext_tensor_in;
1735 result.common.ext_tensor_out = ext_tensor_out;
1736 result.events = events;
1739
1740 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1741
1742 return result;
1743}
1744
1745static inline TTL_io_ushort_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1746 TTL_duplex_const_ushort_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
1747 TTL_tile_t tile_current_export) {
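    // The incoming tile is described twice below: as a sub-tensor of the external input
    // (using ext_tensor_in's layout and the tile's offset) and as a packed internal
    // tensor at int_base[0] whose layout is built from the tile's own width and height.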
1748 const TTL_layout_t next_import_layout =
1749 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
1750     const TTL_const_ext_ushort_tensor_t next_import_ext_tensor =
1751         TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
1752                                     tile_current_import.shape,
1753 duplex_buffering->common.ext_tensor_in.layout,
1754 tile_current_import.offset,
1755 duplex_buffering->common.ext_tensor_in.elem_size);
1756 const TTL_int_ushort_sub_tensor_t next_import_int_sub_tensor =
1757 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
1758 tile_current_import.shape,
1759 next_import_layout,
1760 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1761 tile_current_import.offset);
1762
1763 const TTL_const_int_ushort_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
1764 const TTL_ext_ushort_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
1765
1766 if (TTL_tile_empty(tile_current_import) == false)
1767 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
1768 *TTL_to_void_tensor(&next_import_ext_tensor),
1769 &(*duplex_buffering->events)[0]);
1770
1771 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
1772 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
1773 *TTL_to_void_tensor(&next_export_ext_tensor),
1774 &(*duplex_buffering->events)[1]);
1775
1776 const TTL_layout_t int_export_layout =
1777 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
1778     const TTL_ext_ushort_tensor_t to_export_to =
1779         TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
1780                               tile_current_export.shape,
1781 duplex_buffering->common.ext_tensor_out.layout,
1782 tile_current_export.offset,
1783 duplex_buffering->common.ext_tensor_out.elem_size);
1784 const TTL_int_ushort_sub_tensor_t to_export_from =
1785 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
1786 tile_current_export.shape,
1787 int_export_layout,
1788 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
1789 tile_current_export.offset);
1790
1791 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
1792 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
1793
1794 TTL_wait(2, *duplex_buffering->events);
1795
1796 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
1797}
1798
1799static inline void __attribute__((overloadable)) TTL_finish_buffering(
1800 TTL_duplex_const_ushort_tensor_buffering_t *const duplex_buffering) {
1802}
1803/*
1804 * TTL_duplex_scheme.h
1805 *
1806 * Copyright (c) 2023 Mobileye
1807 *
1808 * Licensed under the Apache License, Version 2.0 (the License);
1809 * you may not use this file except in compliance with the License.
1810 * You may obtain a copy of the License at
1811 *
1812 * http://www.apache.org/licenses/LICENSE-2.0
1813 *
1814 * Unless required by applicable law or agreed to in writing, software
1815 * distributed under the License is distributed on an AS IS BASIS,
1816 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
1817 * See the License for the specific language governing permissions and
1818 * limitations under the License.
1819 */
1820
1821// clang-format off
1822/**
1823 * @file
1824 *
1825 * Given a pair of blocking import and export operations that can execute concurrently,
1826 * TTL_duplex_buffering issues them together and then waits for both to complete,
1827 * ideally executing them in parallel to each other. This scheme uses two
1828 * internal buffers, one for the import and one for the export. Note that the
1829 * export is pipelined to pair the import of the current tile with the export of
1830 * the previous tile.
1831
1832 * The following table shows the pipelined actions performed in duplex buffering.
1833 * It specifies which tile is processed in each iteration:
1834 *
1835 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
1836 * |-------------------|-----|-----|----------------------|---------------|
1837 * | **Import** | 0 | 1 | i | |
1838 * | **Wait Import** | 0 | 1 | i | |
1839 * | **Compute** | 0 | 1 | i | |
1840 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
1841 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
1842 *
1843 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
1844 *
1845 * When including this file the following must be defined
1846 *
1847 * #define TTL_TENSOR_TYPE void
1848 * #define TTL_TENSOR_TYPE uchar
1849 * etc
1850 *
1851 * @example TTL_duplex_buffering.cl
1852 */
1853// clang-format on
1854
1855// This file presumes that the following have been pre included.
1856// this is not done here for path reasons.
1857// #include "TTL_core.h"
1858// #include "TTL_import_export.h"
1859// #include TTL_IMPORT_EXPORT_INCLUDE_H
1860
1861/**
1862 * @def The structs used for this buffering type
1863 */
1864
1865/**
1866 * @brief Data required to perform duplex buffer pipelining.
1867 *
1868 * @see TTL_start_duplex_buffering for a description of duplex buffer
1869 * pipelining.
1870 */
1871typedef struct {
1872 struct {
1873 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
1874 0->1->0->1... etc */
1875 __local long *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
1876 TTL_ext_long_tensor_t ext_tensor_in; /*!< The external tensor being input */
1877 TTL_ext_long_tensor_t ext_tensor_out; /*!< The external tensor being output */
1878 } common; ///< The information that is common to all pipeline schemes
1879
1880 TTL_event_t (*events)[2]; ///< Two events are required; the first is used for
1881 ///< external to internal transfers, the second for
1882 ///< internal to external transfers
1883
1884 /**
1885 * @brief Store of the buffers used for the previous import/export cycles.
1886 *
1887 */
1888     struct {
1889         TTL_const_int_long_tensor_t to_export_from; /*!< Internal tensor to export from on the next step */
1890         TTL_ext_long_tensor_t to_export_to; /*!< External tensor to export to on the next step */
1891     } prev_out_tensors;
1892 } TTL_duplex_const_long_tensor_buffering_t;
1893
1894/*
1895 * Predeclare TTL_step_buffering.
1896 */
1897static inline TTL_io_long_tensor_t __attribute__((overloadable)) TTL_step_buffering(
1898 TTL_duplex_const_long_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
1899 TTL_tile_t tile_current_export);
1900
1901/**
1902 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
1903 *
1904 * @param ext_tensor_in A tensor describing the input in global memory
1905 * @param int_base_in The address of the local import buffer.
1906 * @param ext_tensor_out A tensor describing the output in global memory
1907 * @param int_base_out The address of the local export buffer.
1908 * @param events A pointer to a list of 2 events.
1909 * The first event in the list will be used for imports, the second event in
1910 * the list will be used for exports.
1911 * @param first_tile The first tile to fetch for the scheme
1912 *
1913 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
1914 *
1915 * The first event in the list will be used for imports,
1916 * the second event in the list will be used for exports.
1917 * \n\n Example:
1918 * @code
1919 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
1920 *
1921 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
1922 * ext_tensor_in, l_buffers[0],
1923 * ext_tensor_out, l_buffers[1],
1924 * &events, first_tile);
1925 * @endcode
1926 * \n
1927 *
1928 * Duplex buffering uses one local buffer for the import and one for the export, and
1929 * pairs the import of the current tile with the export of the previous tile so that
1930 * the two transfers can proceed in parallel.
1931 *
1932 * The simplest form of duplex buffering takes the following flow.
1933 *
1934 * @startuml
1935 *
1936 * start
1937 *
1938 * :Create a TTL_tiler_t with TTL_create_tiler;
1939 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
1940 * 1 input buffer, 1 output buffer;
1941 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1942 *
1943 * while (for each tile)
1944 *
1945 * :Import The Next Tile into the input buffer;
1946 *
1947 * :Process the Tile from the input buffer to the output buffer;
1948 *
1949 * :Export the processed tile from the output buffer;
1950 *
1951 * endwhile
1952 *
1953 * stop
1954 *
1955 * @enduml
1956 *
1957 * This can be optimized and standardized using the TTL_step_buffering
1958 * call.
1959 *
1960 * @startuml
1961 *
1962 * start
1963 *
1964 * :Create a TTL_tiler_t with TTL_create_tiler;
1965 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
1966 * :NumberOfTiles = TTL_number_of_tiles(tiler);
1967 *
1968 * while (for each tile)
1969 *
1970 * :Call TTL_step_buffering for the current tile
1971 *
1972 * This imports the new tile and exports the previous tile
1973 * in parallel;
1974 *
1975 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
1976 * :Process the Tile from the input buffer to the output buffer;
1977 * endif
1978 *
1979 * endwhile
1980 *
1981 * stop
1982 *
1983 * @enduml
1984 */
1985 static inline TTL_duplex_const_long_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
1986     TTL_ext_long_tensor_t ext_tensor_in, __local long *int_base_in, TTL_ext_long_tensor_t ext_tensor_out,
1987     __local long *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
1988     TTL_duplex_const_long_tensor_buffering_t result;
1989     result.common.int_base[0] = int_base_in;
1990 result.common.int_base[1] = int_base_out;
1991
1992 result.common.ext_tensor_in = ext_tensor_in;
1993 result.common.ext_tensor_out = ext_tensor_out;
1994 result.events = events;
1997
1998 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
1999
2000 return result;
2001}
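// Editor's sketch: one way the external tensors passed to TTL_start_duplex_buffering
// above might be built. The five TTL_create_ext_tensor arguments mirror the call made
// inside TTL_step_buffering below; TTL_create_shape, TTL_create_offset and the
// single-argument TTL_create_layout are assumed helpers from TTL_core.h, and the
// stride, base and size names are illustrative only.
//
//     const TTL_shape_t image_shape = TTL_create_shape(width, height);
//     const TTL_layout_t ext_layout = TTL_create_layout(row_stride_in_elements);
//     const TTL_ext_long_tensor_t ext_tensor_in = TTL_create_ext_tensor(
//         ext_base_in, image_shape, ext_layout, TTL_create_offset(0, 0), sizeof(long));
//     const TTL_ext_long_tensor_t ext_tensor_out = TTL_create_ext_tensor(
//         ext_base_out, image_shape, ext_layout, TTL_create_offset(0, 0), sizeof(long));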
2002
2003static inline TTL_io_long_tensor_t __attribute__((overloadable)) TTL_step_buffering(
2004 TTL_duplex_const_long_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
2005 TTL_tile_t tile_current_export) {
2006 const TTL_layout_t next_import_layout =
2007 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
2008     const TTL_const_ext_long_tensor_t next_import_ext_tensor =
2009         TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
2010                                     tile_current_import.shape,
2011 duplex_buffering->common.ext_tensor_in.layout,
2012 tile_current_import.offset,
2013 duplex_buffering->common.ext_tensor_in.elem_size);
2014 const TTL_int_long_sub_tensor_t next_import_int_sub_tensor =
2015 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
2016 tile_current_import.shape,
2017 next_import_layout,
2018 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2019 tile_current_import.offset);
2020
2021 const TTL_const_int_long_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
2022 const TTL_ext_long_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
2023
2024 if (TTL_tile_empty(tile_current_import) == false)
2025 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
2026 *TTL_to_void_tensor(&next_import_ext_tensor),
2027 &(*duplex_buffering->events)[0]);
2028
2029 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
2030 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
2031 *TTL_to_void_tensor(&next_export_ext_tensor),
2032 &(*duplex_buffering->events)[1]);
2033
2034 const TTL_layout_t int_export_layout =
2035 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
2036 const TTL_ext_long_tensor_t to_export_to = TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
2037 tile_current_export.shape,
2038 duplex_buffering->common.ext_tensor_out.layout,
2039 tile_current_export.offset,
2040 duplex_buffering->common.ext_tensor_out.elem_size);
2041 const TTL_int_long_sub_tensor_t to_export_from =
2042 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
2043 tile_current_export.shape,
2044 int_export_layout,
2045 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2046 tile_current_export.offset);
2047
2048 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
2049 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
2050
2051 TTL_wait(2, *duplex_buffering->events);
2052
2053 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
2054}
2055
2056static inline void __attribute__((overloadable)) TTL_finish_buffering(
2057 TTL_duplex_const_long_tensor_buffering_t *const duplex_buffering) {
2059}
2060/*
2061 * TTL_duplex_scheme.h
2062 *
2063 * Copyright (c) 2023 Mobileye
2064 *
2065 * Licensed under the Apache License, Version 2.0 (the License);
2066 * you may not use this file except in compliance with the License.
2067 * You may obtain a copy of the License at
2068 *
2069 * http://www.apache.org/licenses/LICENSE-2.0
2070 *
2071 * Unless required by applicable law or agreed to in writing, software
2072 * distributed under the License is distributed on an AS IS BASIS,
2073 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
2074 * See the License for the specific language governing permissions and
2075 * limitations under the License.
2076 */
2077
2078// clang-format off
2079/**
2080 * @file
2081 *
2082 * Given a pair of blocking import and export operations that can execute concurrently,
2083 * TTL_duplex_buffering issues them together and then waits for both to complete,
2084 * ideally executing them in parallel to each other. This scheme uses two
2085 * internal buffers, one for the import and one for the export. Note that the
2086 * export is pipelined to pair the import of the current tile with the export of
2087 * the previous tile.
2088
2089 * The following table shows the pipelined actions performed in duplex buffering.
2090 * It specifies which tile is processed in each iteration:
2091 *
2092 * | Action\\Iteration | \#0 | \#1 | \#i (2:NumOfTiles-1) | \#NumOfTiles |
2093 * |-------------------|-----|-----|----------------------|---------------|
2094 * | **Import** | 0 | 1 | i | |
2095 * | **Wait Import** | 0 | 1 | i | |
2096 * | **Compute** | 0 | 1 | i | |
2097 * | **Export** | | 0 | i-1 | NumOfTiles-1 |
2098 * | **WaitExport** | | 0 | i-1 | NumOfTiles-1 |
2099 *
2100 * Notice the epilog (\#NumOfTiles) which is an extra iteration.
2101 *
2102 * When including this file the following must be defined
2103 *
2104 * #define TTL_TENSOR_TYPE void
2105 * #define TTL_TENSOR_TYPE uchar
2106 * etc
2107 *
2108 * @example TTL_duplex_buffering.cl
2109 */
2110// clang-format on
2111
2112// This file presumes that the following have been pre included.
2113// this is not done here for path reasons.
2114// #include "TTL_core.h"
2115// #include "TTL_import_export.h"
2116// #include TTL_IMPORT_EXPORT_INCLUDE_H
2117
2118/**
2119 * @def The structs used for this buffering type
2120 */
2121
2122/**
2123 * @brief Data required to perform duplex buffer pipelining.
2124 *
2125 * @see TTL_start_duplex_buffering for a description of duplex buffer
2126 * pipelining.
2127 */
2128typedef struct {
2129 struct {
2130 int index; /*!< Describes the current buffer index when pipelining. For single 0->1->0, for double
2131 0->1->0->1... etc */
2132 __local ulong *int_base[2]; /*!< The internal base addresses of the pipelined tiles. */
2133 TTL_ext_ulong_tensor_t ext_tensor_in; /*!< The external tensor being input */
2134 TTL_ext_ulong_tensor_t ext_tensor_out; /*!< The external tensor being output */
2135 } common; ///< The information that is common to all pipeline schemes
2136
2137 TTL_event_t (*events)[2]; ///< Two events are required; the first is used for
2138 ///< external to internal transfers, the second for
2139 ///< internal to external transfers
2140
2141 /**
2142 * @brief Store of the buffers used for the previous import/export cycles.
2143 *
2144 */
2145     struct {
2146         TTL_const_int_ulong_tensor_t to_export_from; /*!< Internal tensor to export from on the next step */
2147         TTL_ext_ulong_tensor_t to_export_to; /*!< External tensor to export to on the next step */
2148     } prev_out_tensors;
2149 } TTL_duplex_const_ulong_tensor_buffering_t;
2150
2151/*
2152 * Predeclare TTL_step_buffering.
2153 */
2154static inline TTL_io_ulong_tensor_t __attribute__((overloadable)) TTL_step_buffering(
2155 TTL_duplex_const_ulong_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_next_import,
2156 TTL_tile_t tile_current_export);
2157
2158/**
2159 * @brief Create a TTL_DUPLEX_BUFFERING_TYPE and begin the buffering process
2160 *
2161 * @param ext_tensor_in A tensor describing the input in global memory
2162 * @param int_base_in The address of the local import buffer.
2163 * @param ext_tensor_out A tensor describing the output in global memory
2164 * @param int_base_out The address of the local export buffer.
2165 * @param events A pointer to a list of 2 events.
2166 * The first event in the list will be used for imports, the second event in
2167 * the list will be used for exports.
2168 * @param first_tile The first tile to fetch for the scheme
2169 *
2170 * @return The TTL_DUPLEX_BUFFERING_TYPE created from the input parameters.
2171 *
2172 * The first event in the list will be used for imports,
2173 * the second event in the list will be used for exports.
2174 * \n\n Example:
2175 * @code
2176 * TTL_event_t events[2] = { TTL_get_event(), TTL_get_event()};
2177 *
2178 * TTL_duplex_buffering_t buffering_scheme = TTL_start_duplex_buffering(
2179 * ext_tensor_in, l_buffers[0],
2180 * ext_tensor_out, l_buffers[1],
2181 * &events, first_tile);
2182 * @endcode
2183 * \n
2184 *
2185 * Duplex buffering uses one local buffer for the import and one for the export, and
2186 * pairs the import of the current tile with the export of the previous tile so that
2187 * the two transfers can proceed in parallel.
2188 *
2189 * The simplest form of duplex buffering takes the following flow.
2190 *
2191 * @startuml
2192 *
2193 * start
2194 *
2195 * :Create a TTL_tiler_t with TTL_create_tiler;
2196 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers
2197 * 1 input buffer, 1 output buffer;
2198 * :NumberOfTiles = TTL_number_of_tiles(tiler);
2199 *
2200 * while (for each tile)
2201 *
2202 * :Import The Next Tile into the input buffer;
2203 *
2204 * :Process the Tile from the input buffer to the output buffer;
2205 *
2206 * :Export the processed tile from the output buffer;
2207 *
2208 * endwhile
2209 *
2210 * stop
2211 *
2212 * @enduml
2213 *
2214 * This can be optimized and standardized using the TTL_step_buffering
2215 * call.
2216 *
2217 * @startuml
2218 *
2219 * start
2220 *
2221 * :Create a TTL_tiler_t with TTL_create_tiler;
2222 * :Create a TTL_duplex_buffering_t Structure with 2 Buffers 1 input buffer, 1 output buffer;
2223 * :NumberOfTiles = TTL_number_of_tiles(tiler);
2224 *
2225 * while (for each tile)
2226 *
2227 * :Call TTL_step_buffering for the current tile
2228 *
2229 * This imports the new tile and exports the previous tile
2230 * in parallel;
2231 *
2232 * if (Does the input buffer contain a valid tile? **TTL_tile_empty(...)**) then (yes)
2233 * :Process the Tile from the input buffer to the output buffer;
2234 * endif
2235 *
2236 * endwhile
2237 *
2238 * stop
2239 *
2240 * @enduml
2241 */
2242 static inline TTL_duplex_const_ulong_tensor_buffering_t __attribute__((overloadable)) TTL_start_duplex_buffering(
2243     TTL_ext_ulong_tensor_t ext_tensor_in, __local ulong *int_base_in, TTL_ext_ulong_tensor_t ext_tensor_out,
2244     __local ulong *int_base_out, TTL_event_t (*events)[2], TTL_tile_t first_tile) {
2245     TTL_duplex_const_ulong_tensor_buffering_t result;
2246     result.common.int_base[0] = int_base_in;
2247 result.common.int_base[1] = int_base_out;
2248
2249 result.common.ext_tensor_in = ext_tensor_in;
2250 result.common.ext_tensor_out = ext_tensor_out;
2251 result.events = events;
2254
2255 TTL_step_buffering(&result, first_tile, TTL_create_empty_tile());
2256
2257 return result;
2258}
2259
2260static inline TTL_io_ulong_tensor_t __attribute__((overloadable)) TTL_step_buffering(
2261 TTL_duplex_const_ulong_tensor_buffering_t *const duplex_buffering, TTL_tile_t tile_current_import,
2262 TTL_tile_t tile_current_export) {
2263 const TTL_layout_t next_import_layout =
2264 TTL_create_layout(tile_current_import.shape.width, tile_current_import.shape.height);
2265     const TTL_const_ext_ulong_tensor_t next_import_ext_tensor =
2266         TTL_create_const_ext_tensor(duplex_buffering->common.ext_tensor_in.base,
2267                                     tile_current_import.shape,
2268 duplex_buffering->common.ext_tensor_in.layout,
2269 tile_current_import.offset,
2270 duplex_buffering->common.ext_tensor_in.elem_size);
2271 const TTL_int_ulong_sub_tensor_t next_import_int_sub_tensor =
2272 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[0],
2273 tile_current_import.shape,
2274 next_import_layout,
2275 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2276 tile_current_import.offset);
2277
2278 const TTL_const_int_ulong_tensor_t next_export_int_tensor = duplex_buffering->prev_out_tensors.to_export_from;
2279 const TTL_ext_ulong_tensor_t next_export_ext_tensor = duplex_buffering->prev_out_tensors.to_export_to;
2280
2281 if (TTL_tile_empty(tile_current_import) == false)
2282 TTL_import_sub_tensor(*TTL_to_void_sub_tensor(&next_import_int_sub_tensor),
2283 *TTL_to_void_tensor(&next_import_ext_tensor),
2284 &(*duplex_buffering->events)[0]);
2285
2286 if (TTL_const_int_tensor_empty(duplex_buffering->prev_out_tensors.to_export_from) == false)
2287 TTL_export(*TTL_to_void_tensor(&next_export_int_tensor),
2288 *TTL_to_void_tensor(&next_export_ext_tensor),
2289 &(*duplex_buffering->events)[1]);
2290
2291 const TTL_layout_t int_export_layout =
2292 TTL_create_layout(tile_current_export.shape.width, tile_current_export.shape.height);
2293     const TTL_ext_ulong_tensor_t to_export_to =
2294         TTL_create_ext_tensor(duplex_buffering->common.ext_tensor_out.base,
2295                               tile_current_export.shape,
2296 duplex_buffering->common.ext_tensor_out.layout,
2297 tile_current_export.offset,
2298 duplex_buffering->common.ext_tensor_out.elem_size);
2299 const TTL_int_ulong_sub_tensor_t to_export_from =
2300 TTL_create_int_sub_tensor(duplex_buffering->common.int_base[1],
2301 tile_current_export.shape,
2302 int_export_layout,
2303 *TTL_to_const_tensor(&duplex_buffering->common.ext_tensor_in),
2304 tile_current_export.offset);
2305
2306 duplex_buffering->prev_out_tensors.to_export_to = to_export_to;
2307 duplex_buffering->prev_out_tensors.to_export_from = *TTL_to_const_tensor(&to_export_from.tensor);
2308
2309 TTL_wait(2, *duplex_buffering->events);
2310
2311 return TTL_create_io_tensors(next_import_int_sub_tensor, to_export_from);
2312}
2313
2314static inline void __attribute__((overloadable)) TTL_finish_buffering(
2315 TTL_duplex_const_ulong_tensor_buffering_t *const duplex_buffering) {
2317}