diff --git a/Kconfig b/Kconfig index 621f1a15fc..aacc6291f8 100644 --- a/Kconfig +++ b/Kconfig @@ -363,6 +363,8 @@ menu "LVGL configuration" bool "1: NEON" config LV_DRAW_SW_ASM_HELIUM bool "2: HELIUM" + config LV_DRAW_SW_ASM_RISCV_V + bool "3: RISC-V Vector" config LV_DRAW_SW_ASM_CUSTOM bool "255: CUSTOM" endchoice @@ -372,6 +374,7 @@ menu "LVGL configuration" default 0 if LV_DRAW_SW_ASM_NONE default 1 if LV_DRAW_SW_ASM_NEON default 2 if LV_DRAW_SW_ASM_HELIUM + default 3 if LV_DRAW_SW_ASM_RISCV_V default 255 if LV_DRAW_SW_ASM_CUSTOM config LV_DRAW_SW_ASM_CUSTOM_INCLUDE diff --git a/scripts/lv_conf_internal_gen.py b/scripts/lv_conf_internal_gen.py index 7e0462db08..24fb05ae5b 100755 --- a/scripts/lv_conf_internal_gen.py +++ b/scripts/lv_conf_internal_gen.py @@ -66,6 +66,7 @@ fout.write( #define LV_DRAW_SW_ASM_NONE 0 #define LV_DRAW_SW_ASM_NEON 1 #define LV_DRAW_SW_ASM_HELIUM 2 +#define LV_DRAW_SW_ASM_RISCV_V 3 #define LV_DRAW_SW_ASM_CUSTOM 255 #define LV_NEMA_HAL_CUSTOM 0 diff --git a/src/draw/sw/blend/lv_draw_sw_blend_to_rgb888.c b/src/draw/sw/blend/lv_draw_sw_blend_to_rgb888.c index 7a9daa0839..c92ce07df2 100644 --- a/src/draw/sw/blend/lv_draw_sw_blend_to_rgb888.c +++ b/src/draw/sw/blend/lv_draw_sw_blend_to_rgb888.c @@ -22,6 +22,8 @@ #include "neon/lv_blend_neon.h" #elif LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_HELIUM #include "helium/lv_blend_helium.h" +#elif LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_RISCV_V + #include "riscv_v/lv_blend_riscv_v.h" #elif LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_CUSTOM #include LV_DRAW_SW_ASM_CUSTOM_INCLUDE #endif diff --git a/src/draw/sw/blend/riscv_v/lv_blend_riscv_v.h b/src/draw/sw/blend/riscv_v/lv_blend_riscv_v.h new file mode 100644 index 0000000000..eb1d3ee8d3 --- /dev/null +++ b/src/draw/sw/blend/riscv_v/lv_blend_riscv_v.h @@ -0,0 +1,25 @@ +/** + * @file lv_blend_riscv_v.h + * RISC-V Vector extension blend header + */ + +#ifndef LV_BLEND_RISCV_V_H +#define LV_BLEND_RISCV_V_H + +#ifdef __cplusplus +extern "C" { +#endif + +#include "../lv_draw_sw_blend.h" + +#if LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_RISCV_V + +#include "lv_draw_sw_blend_riscv_v_to_rgb888.h" + +#endif /* LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_RISCV_V */ + +#ifdef __cplusplus +} +#endif + +#endif /* LV_BLEND_RISCV_V_H */ diff --git a/src/draw/sw/blend/riscv_v/lv_blend_riscv_v_private.h b/src/draw/sw/blend/riscv_v/lv_blend_riscv_v_private.h new file mode 100644 index 0000000000..357d4d4373 --- /dev/null +++ b/src/draw/sw/blend/riscv_v/lv_blend_riscv_v_private.h @@ -0,0 +1,396 @@ +/** + * @file lv_blend_riscv_v_private.h + * Common macros and utilities for RISC-V Vector Extension (RVV 1.0) blend operations + * + * This header provides reusable RVV macros for: + * - Segmented load/store operations (RGB888/XRGB8888/RGB565) + * - Alpha blending with scalar or vector alpha + * - Color format conversions (RGB565 <-> RGB888) + * - Effective alpha calculations (alpha, mask, opa combinations) + */ + +#ifndef LV_BLEND_RISCV_V_PRIVATE_H +#define LV_BLEND_RISCV_V_PRIVATE_H + +#ifdef __cplusplus +extern "C" { +#endif + +/********************* + * INCLUDES + *********************/ + +#include "../../../../lv_conf_internal.h" +#if LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_RISCV_V + +/* Try to use real RVV, fall back to emulation if not available */ +#ifdef __riscv_v +#include +#else +/* No real RVV available, use emulation */ +#include "lv_blend_riscv_vector_emulation.h" +#endif + +/********************* + * DEFINES + *********************/ + +/********************** + * RVV SEGMENTED LOAD/STORE MACROS + * + * Emulate 
segmented load/store using stride operations. + * Compatible with compilers that don't support RVV 1.0 tuple types. + **********************/ + +/* RGB888: 3 channels (B,G,R) with stride=3 */ +#define LV_RVV_VLSEG3E8_U8M2(base, vl, v_b, v_g, v_r) \ + do { \ + (v_b) = __riscv_vlse8_v_u8m2((base) + 0, 3, (vl)); \ + (v_g) = __riscv_vlse8_v_u8m2((base) + 1, 3, (vl)); \ + (v_r) = __riscv_vlse8_v_u8m2((base) + 2, 3, (vl)); \ + } while(0) + +#define LV_RVV_VSSEG3E8_U8M2(base, v_b, v_g, v_r, vl) \ + do { \ + __riscv_vsse8_v_u8m2((base) + 0, 3, (v_b), (vl)); \ + __riscv_vsse8_v_u8m2((base) + 1, 3, (v_g), (vl)); \ + __riscv_vsse8_v_u8m2((base) + 2, 3, (v_r), (vl)); \ + } while(0) + +/* XRGB8888/ARGB8888: 4 channels (B,G,R,X) with stride=4 */ +#define LV_RVV_VLSEG4E8_U8M2(base, vl, v_b, v_g, v_r, v_x) \ + do { \ + (v_b) = __riscv_vlse8_v_u8m2((base) + 0, 4, (vl)); \ + (v_g) = __riscv_vlse8_v_u8m2((base) + 1, 4, (vl)); \ + (v_r) = __riscv_vlse8_v_u8m2((base) + 2, 4, (vl)); \ + (v_x) = __riscv_vlse8_v_u8m2((base) + 3, 4, (vl)); \ + } while(0) + +#define LV_RVV_VSSEG4E8_U8M2(base, v_b, v_g, v_r, v_x, vl) \ + do { \ + __riscv_vsse8_v_u8m2((base) + 0, 4, (v_b), (vl)); \ + __riscv_vsse8_v_u8m2((base) + 1, 4, (v_g), (vl)); \ + __riscv_vsse8_v_u8m2((base) + 2, 4, (v_r), (vl)); \ + __riscv_vsse8_v_u8m2((base) + 3, 4, (v_x), (vl)); \ + } while(0) + +/********************** + * RGB565 <-> RGB888 CONVERSION MACROS + * + * RGB565 format: RRRRRGGGGGGBBBBB (5-6-5 bits) + * Conversion formulas: + * R8 = (R5 * 2106) >> 8 (2106 ≈ 255/31 * 256) + * G8 = (G6 * 1037) >> 8 (1037 ≈ 255/63 * 256) + * B8 = (B5 * 2106) >> 8 + **********************/ + +/* Extract and convert RGB565 to separate R8, G8, B8 channels (16-bit width) */ +#define LV_RVV_RGB565_TO_RGB888_U16M2(v_rgb565, v_r8, v_g8, v_b8, vl) \ + do { \ + vuint16m2_t _r5 = __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2((v_rgb565), 11, (vl)), 0x1F, (vl)); \ + vuint16m2_t _g6 = __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2((v_rgb565), 5, (vl)), 0x3F, (vl)); \ + vuint16m2_t _b5 = __riscv_vand_vx_u16m2((v_rgb565), 0x1F, (vl)); \ + (v_r8) = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(_r5, 2106, (vl)), 8, (vl)); \ + (v_g8) = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(_g6, 1037, (vl)), 8, (vl)); \ + (v_b8) = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(_b5, 2106, (vl)), 8, (vl)); \ + } while(0) + +/* Convert R8, G8, B8 to RGB565 (16-bit) */ +#define LV_RVV_RGB888_TO_RGB565_U16M2(v_r8, v_g8, v_b8, v_rgb565, vl) \ + do { \ + vuint16m2_t _r5 = __riscv_vsrl_vx_u16m2((v_r8), 3, (vl)); \ + vuint16m2_t _g6 = __riscv_vsrl_vx_u16m2((v_g8), 2, (vl)); \ + vuint16m2_t _b5 = __riscv_vsrl_vx_u16m2((v_b8), 3, (vl)); \ + (v_rgb565) = __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(_r5, 11, (vl)), \ + __riscv_vor_vv_u16m2(__riscv_vsll_vx_u16m2(_g6, 5, (vl)), _b5, (vl)), (vl)); \ + } while(0) + +/********************** + * ALPHA BLENDING MACROS + * + * Standard blend formula: result = (src * alpha + dst * (255 - alpha)) >> 8 + * + * Using vwmaccu (widening multiply-accumulate unsigned): + * tmp = dst * (255 - alpha) // Initialize with dst contribution + * tmp = tmp + src * alpha // vwmaccu adds src contribution + * result = tmp >> 8 + * + * This reduces operations by combining multiply and add. 
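+ *
+ * Illustrative arithmetic (example values, not taken from any caller):
+ * src = 200, dst = 100, alpha = 128
+ *   tmp    = 100 * (255 - 128)   = 12700
+ *   tmp    = 12700 + 200 * 128   = 38300
+ *   result = 38300 >> 8          = 149
+ * The exact mix 38300 / 255 ≈ 150, so the >> 8 shortcut is off by at most 1.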
+ **********************/ + +/** + * Blend single channel using vwmaccu (8-bit src/dst -> 16-bit intermediate) + * LMUL relationship: m1 -> m2, m2 -> m4 + */ +#define LV_RVV_BLEND_CHANNEL_U8M1_TO_U16M2(v_src, v_dst, alpha, v_result, vl) \ + do { \ + uint8_t _alpha_inv = 255 - (alpha); \ + vuint16m2_t _tmp = __riscv_vwmulu_vx_u16m2((v_dst), _alpha_inv, (vl)); \ + _tmp = __riscv_vwmaccu_vx_u16m2(_tmp, (alpha), (v_src), (vl)); \ + (v_result) = __riscv_vnsrl_wx_u8m1(_tmp, 8, (vl)); \ + } while(0) + +#define LV_RVV_BLEND_CHANNEL_U8M2_TO_U16M4(v_src, v_dst, alpha, v_result, vl) \ + do { \ + uint8_t _alpha_inv = 255 - (alpha); \ + vuint16m4_t _tmp = __riscv_vwmulu_vx_u16m4((v_dst), _alpha_inv, (vl)); \ + _tmp = __riscv_vwmaccu_vx_u16m4(_tmp, (alpha), (v_src), (vl)); \ + (v_result) = __riscv_vnsrl_wx_u8m2(_tmp, 8, (vl)); \ + } while(0) + +/** + * Blend single channel with vector alpha (per-pixel mask) + */ +#define LV_RVV_BLEND_CHANNEL_VMASK_U8M1_TO_U16M2(v_src, v_dst, v_alpha, v_result, vl) \ + do { \ + vuint8m1_t _v_alpha_inv = __riscv_vrsub_vx_u8m1((v_alpha), 255, (vl)); \ + vuint16m2_t _tmp = __riscv_vwmulu_vv_u16m2((v_dst), _v_alpha_inv, (vl)); \ + _tmp = __riscv_vwmaccu_vv_u16m2(_tmp, (v_alpha), (v_src), (vl)); \ + (v_result) = __riscv_vnsrl_wx_u8m1(_tmp, 8, (vl)); \ + } while(0) + +#define LV_RVV_BLEND_CHANNEL_VMASK_U8M2_TO_U16M4(v_src, v_dst, v_alpha, v_result, vl) \ + do { \ + vuint8m2_t _v_alpha_inv = __riscv_vrsub_vx_u8m2((v_alpha), 255, (vl)); \ + vuint16m4_t _tmp = __riscv_vwmulu_vv_u16m4((v_dst), _v_alpha_inv, (vl)); \ + _tmp = __riscv_vwmaccu_vv_u16m4(_tmp, (v_alpha), (v_src), (vl)); \ + (v_result) = __riscv_vnsrl_wx_u8m2(_tmp, 8, (vl)); \ + } while(0) + +/** + * Blend RGB channels with scalar alpha (all 3 channels at once) + * Uses m1->m2 widening + */ +#define LV_RVV_BLEND_RGB_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, \ + alpha, v_out_r, v_out_g, v_out_b, vl) \ +do { \ + LV_RVV_BLEND_CHANNEL_U8M1_TO_U16M2((v_src_r), (v_dst_r), (alpha), (v_out_r), (vl)); \ + LV_RVV_BLEND_CHANNEL_U8M1_TO_U16M2((v_src_g), (v_dst_g), (alpha), (v_out_g), (vl)); \ + LV_RVV_BLEND_CHANNEL_U8M1_TO_U16M2((v_src_b), (v_dst_b), (alpha), (v_out_b), (vl)); \ +} while(0) + +/** + * Blend RGB channels with scalar alpha (all 3 channels at once) + * Uses m2->m4 widening + */ +#define LV_RVV_BLEND_RGB_U8M2(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, \ + alpha, v_out_r, v_out_g, v_out_b, vl) \ +do { \ + LV_RVV_BLEND_CHANNEL_U8M2_TO_U16M4((v_src_r), (v_dst_r), (alpha), (v_out_r), (vl)); \ + LV_RVV_BLEND_CHANNEL_U8M2_TO_U16M4((v_src_g), (v_dst_g), (alpha), (v_out_g), (vl)); \ + LV_RVV_BLEND_CHANNEL_U8M2_TO_U16M4((v_src_b), (v_dst_b), (alpha), (v_out_b), (vl)); \ +} while(0) + +/** + * Blend solid color (pre-multiplied) with destination RGB channels + * fg_color_opa: pre-computed (color * opa) for each channel + * opa_inv: 255 - opa + * Formula: result = (dst * opa_inv + fg_color_opa) >> 8 + * Uses m2->m4 widening + */ +#define LV_RVV_BLEND_SOLID_RGB_U8M2(v_dst_r, v_dst_g, v_dst_b, \ + fg_r_opa, fg_g_opa, fg_b_opa, opa_inv, \ + v_out_r, v_out_g, v_out_b, vl) \ +do { \ + vuint16m4_t _v_r16 = __riscv_vwmulu_vx_u16m4((v_dst_r), (opa_inv), (vl)); \ + vuint16m4_t _v_g16 = __riscv_vwmulu_vx_u16m4((v_dst_g), (opa_inv), (vl)); \ + vuint16m4_t _v_b16 = __riscv_vwmulu_vx_u16m4((v_dst_b), (opa_inv), (vl)); \ + _v_r16 = __riscv_vadd_vx_u16m4(_v_r16, (fg_r_opa), (vl)); \ + _v_g16 = __riscv_vadd_vx_u16m4(_v_g16, (fg_g_opa), (vl)); \ + _v_b16 = __riscv_vadd_vx_u16m4(_v_b16, (fg_b_opa), (vl)); \ + (v_out_r) = 
__riscv_vnsrl_wx_u8m2(_v_r16, 8, (vl)); \ + (v_out_g) = __riscv_vnsrl_wx_u8m2(_v_g16, 8, (vl)); \ + (v_out_b) = __riscv_vnsrl_wx_u8m2(_v_b16, 8, (vl)); \ +} while(0) + +/** + * Blend solid color (scalar) with destination RGB channels using vector alpha mask + * fg_r/g/b: foreground color scalar values + * v_alpha: per-pixel alpha values (vuint8m2_t) + * Formula: result = (fg * alpha + dst * (255 - alpha)) >> 8 + * Uses m2->m4 widening with vwmaccu for efficiency + */ +#define LV_RVV_BLEND_SOLID_RGB_VMASK_U8M2(v_dst_r, v_dst_g, v_dst_b, \ + fg_r, fg_g, fg_b, v_alpha, \ + v_out_r, v_out_g, v_out_b, vl) \ +do { \ + vuint8m2_t _v_alpha_inv = __riscv_vrsub_vx_u8m2((v_alpha), 255, (vl)); \ + vuint16m4_t _v_r16 = __riscv_vwmulu_vv_u16m4((v_dst_r), _v_alpha_inv, (vl)); \ + vuint16m4_t _v_g16 = __riscv_vwmulu_vv_u16m4((v_dst_g), _v_alpha_inv, (vl)); \ + vuint16m4_t _v_b16 = __riscv_vwmulu_vv_u16m4((v_dst_b), _v_alpha_inv, (vl)); \ + _v_r16 = __riscv_vwmaccu_vx_u16m4(_v_r16, (fg_r), (v_alpha), (vl)); \ + _v_g16 = __riscv_vwmaccu_vx_u16m4(_v_g16, (fg_g), (v_alpha), (vl)); \ + _v_b16 = __riscv_vwmaccu_vx_u16m4(_v_b16, (fg_b), (v_alpha), (vl)); \ + (v_out_r) = __riscv_vnsrl_wx_u8m2(_v_r16, 8, (vl)); \ + (v_out_g) = __riscv_vnsrl_wx_u8m2(_v_g16, 8, (vl)); \ + (v_out_b) = __riscv_vnsrl_wx_u8m2(_v_b16, 8, (vl)); \ +} while(0) + +/** + * Blend RGB channels with vector alpha (per-pixel mask) + */ +#define LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, \ + v_alpha, v_out_r, v_out_g, v_out_b, vl) \ +do { \ + LV_RVV_BLEND_CHANNEL_VMASK_U8M1_TO_U16M2((v_src_r), (v_dst_r), (v_alpha), (v_out_r), (vl)); \ + LV_RVV_BLEND_CHANNEL_VMASK_U8M1_TO_U16M2((v_src_g), (v_dst_g), (v_alpha), (v_out_g), (vl)); \ + LV_RVV_BLEND_CHANNEL_VMASK_U8M1_TO_U16M2((v_src_b), (v_dst_b), (v_alpha), (v_out_b), (vl)); \ +} while(0) + +/** + * Optimize blend results for zero and full mask cases (u8m1) + * When mask is 0, use destination; when mask is >= 255, use source + */ +#define LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, v_src_r, v_src_g, v_src_b, \ + v_dst_r, v_dst_g, v_dst_b, v_mask, vl) \ +do { \ + vbool8_t _zero_mask = __riscv_vmseq_vx_u8m1_b8((v_mask), 0, (vl)); \ + vbool8_t _full_mask = __riscv_vmsgeu_vx_u8m1_b8((v_mask), LV_OPA_MAX, (vl)); \ + (v_b) = __riscv_vmerge_vvm_u8m1((v_b), (v_dst_b), _zero_mask, (vl)); \ + (v_g) = __riscv_vmerge_vvm_u8m1((v_g), (v_dst_g), _zero_mask, (vl)); \ + (v_r) = __riscv_vmerge_vvm_u8m1((v_r), (v_dst_r), _zero_mask, (vl)); \ + (v_b) = __riscv_vmerge_vvm_u8m1((v_b), (v_src_b), _full_mask, (vl)); \ + (v_g) = __riscv_vmerge_vvm_u8m1((v_g), (v_src_g), _full_mask, (vl)); \ + (v_r) = __riscv_vmerge_vvm_u8m1((v_r), (v_src_r), _full_mask, (vl)); \ +} while(0) + +/** + * Optimize blend results for zero and full mask cases (u8m2) with scalar source + * When mask is 0, use destination; when mask is >= 255, use scalar source + */ +#define LV_RVV_BLEND_OPTIMIZE_MASK_SCALAR_U8M2(v_r, v_g, v_b, src_r, src_g, src_b, \ + v_dst_r, v_dst_g, v_dst_b, v_mask, vl) \ +do { \ + vbool4_t _zero_mask = __riscv_vmseq_vx_u8m2_b4((v_mask), 0, (vl)); \ + vbool4_t _full_mask = __riscv_vmsgeu_vx_u8m2_b4((v_mask), LV_OPA_MAX, (vl)); \ + (v_b) = __riscv_vmerge_vvm_u8m2((v_b), (v_dst_b), _zero_mask, (vl)); \ + (v_g) = __riscv_vmerge_vvm_u8m2((v_g), (v_dst_g), _zero_mask, (vl)); \ + (v_r) = __riscv_vmerge_vvm_u8m2((v_r), (v_dst_r), _zero_mask, (vl)); \ + (v_b) = __riscv_vmerge_vxm_u8m2((v_b), (src_b), _full_mask, (vl)); \ + (v_g) = __riscv_vmerge_vxm_u8m2((v_g), (src_g), _full_mask, (vl)); \ + 
(v_r) = __riscv_vmerge_vxm_u8m2((v_r), (src_r), _full_mask, (vl)); \ +} while(0) + +/********************** + * EFFECTIVE ALPHA CALCULATION MACROS + * + * These macros compute the effective alpha from combinations of: + * - v_alpha: source alpha channel (per-pixel) + * - mask: mask value (per-pixel) + * - opa: global opacity (scalar) + * + * Formula: eff_alpha = (alpha * mask * opa) >> 16 + * Intermediate calculations use 16-bit to prevent overflow. + **********************/ + +/** + * Calculate effective alpha from source alpha and global opa + */ +#define LV_RVV_CALC_EFF_ALPHA_OPA_U8M1(v_src_a, opa, v_eff_a, vl) \ + do { \ + vuint16m2_t _v_alpha16 = __riscv_vzext_vf2_u16m2((v_src_a), (vl)); \ + vuint16m2_t _v_eff16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(_v_alpha16, (opa), (vl)), 8, (vl)); \ + (v_eff_a) = __riscv_vnsrl_wx_u8m1(_v_eff16, 0, (vl)); \ + } while(0) + +/** + * Calculate effective alpha from source alpha and mask + */ +#define LV_RVV_CALC_EFF_ALPHA_MASK_U8M1(v_src_a, v_mask, v_eff_a, vl) \ + do { \ + vuint16m2_t _v_alpha16 = __riscv_vzext_vf2_u16m2((v_src_a), (vl)); \ + vuint16m2_t _v_mask16 = __riscv_vzext_vf2_u16m2((v_mask), (vl)); \ + vuint16m2_t _v_eff16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vv_u16m2(_v_alpha16, _v_mask16, (vl)), 8, (vl)); \ + (v_eff_a) = __riscv_vnsrl_wx_u8m1(_v_eff16, 0, (vl)); \ + } while(0) + +/** + * Calculate effective alpha from source alpha, mask, and global opa + * Formula: eff_alpha = (alpha * mask * opa) >> 16 + * Widen to u32m4 to avoid precision loss from double shift + */ +#define LV_RVV_CALC_EFF_ALPHA_MASK_OPA_U8M1(v_src_a, v_mask, opa, v_eff_a, vl) \ + do { \ + vuint16m2_t _v_alpha16 = __riscv_vzext_vf2_u16m2((v_src_a), (vl)); \ + vuint16m2_t _v_mask16 = __riscv_vzext_vf2_u16m2((v_mask), (vl)); \ + vuint16m2_t _v_prod16 = __riscv_vmul_vv_u16m2(_v_alpha16, _v_mask16, (vl)); \ + vuint32m4_t _v_prod32 = __riscv_vwmulu_vx_u32m4(_v_prod16, (opa), (vl)); \ + vuint16m2_t _v_eff16 = __riscv_vnsrl_wx_u16m2(_v_prod32, 16, (vl)); \ + (v_eff_a) = __riscv_vnsrl_wx_u8m1(_v_eff16, 0, (vl)); \ + } while(0) + +/********************** + * RGB/ARGB CHANNEL LOAD/STORE MACROS (for ARGB8888/RGB888/XRGB8888) + * + * Load/Store RGB channels to/from memory in different formats using m1 LMUL. 
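+ *
+ * Byte layout assumed by these macros (it matches the offsets used below):
+ *   ARGB8888/XRGB8888: byte 0 = B, byte 1 = G, byte 2 = R, byte 3 = A/X, stride 4
+ *   RGB888:            byte 0 = B, byte 1 = G, byte 2 = R,               stride 3
+ * For example, __riscv_vlse8_v_u8m1(ptr + x * 4 + 2, 4, vl) gathers the R byte
+ * of vl consecutive pixels into a single u8m1 register.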
+ **********************/ + +#define LV_RVV_LOAD_ARGB8888_U8M1(ptr, x, v_b, v_g, v_r, v_a, vl) \ + do { \ + (v_b) = __riscv_vlse8_v_u8m1((ptr) + (x) * 4 + 0, 4, (vl)); \ + (v_g) = __riscv_vlse8_v_u8m1((ptr) + (x) * 4 + 1, 4, (vl)); \ + (v_r) = __riscv_vlse8_v_u8m1((ptr) + (x) * 4 + 2, 4, (vl)); \ + (v_a) = __riscv_vlse8_v_u8m1((ptr) + (x) * 4 + 3, 4, (vl)); \ + } while(0) + +#define LV_RVV_LOAD_RGB888_U8M1(ptr, x, v_b, v_g, v_r, vl) \ + do { \ + (v_b) = __riscv_vlse8_v_u8m1((ptr) + (x) * 3 + 0, 3, (vl)); \ + (v_g) = __riscv_vlse8_v_u8m1((ptr) + (x) * 3 + 1, 3, (vl)); \ + (v_r) = __riscv_vlse8_v_u8m1((ptr) + (x) * 3 + 2, 3, (vl)); \ + } while(0) + +#define LV_RVV_LOAD_XRGB8888_U8M1(ptr, x, v_b, v_g, v_r, vl) \ + do { \ + (v_b) = __riscv_vlse8_v_u8m1((ptr) + (x) * 4 + 0, 4, (vl)); \ + (v_g) = __riscv_vlse8_v_u8m1((ptr) + (x) * 4 + 1, 4, (vl)); \ + (v_r) = __riscv_vlse8_v_u8m1((ptr) + (x) * 4 + 2, 4, (vl)); \ + } while(0) + +#define LV_RVV_STORE_RGB888_U8M1(ptr, x, v_b, v_g, v_r, vl) \ + do { \ + __riscv_vsse8_v_u8m1((ptr) + (x) * 3 + 0, 3, (v_b), (vl)); \ + __riscv_vsse8_v_u8m1((ptr) + (x) * 3 + 1, 3, (v_g), (vl)); \ + __riscv_vsse8_v_u8m1((ptr) + (x) * 3 + 2, 3, (v_r), (vl)); \ + } while(0) + +#define LV_RVV_STORE_XRGB8888_U8M1(ptr, x, v_b, v_g, v_r, v_a, vl) \ + do { \ + __riscv_vsse8_v_u8m1((ptr) + (x) * 4 + 0, 4, (v_b), (vl)); \ + __riscv_vsse8_v_u8m1((ptr) + (x) * 4 + 1, 4, (v_g), (vl)); \ + __riscv_vsse8_v_u8m1((ptr) + (x) * 4 + 2, 4, (v_r), (vl)); \ + __riscv_vsse8_v_u8m1((ptr) + (x) * 4 + 3, 4, (v_a), (vl)); \ + } while(0) + +/********************** + * MACROS + **********************/ + +/********************** + * TYPEDEFS + **********************/ + +/********************** + * STATIC PROTOTYPES + **********************/ + +static inline void * LV_ATTRIBUTE_FAST_MEM drawbuf_next_row(const void * buf, uint32_t stride) +{ + return (void *)((uint8_t *)buf + stride); +} + +/********************** + * GLOBAL PROTOTYPES + **********************/ + +/********************** + * MACROS + **********************/ + +#endif + +#ifdef __cplusplus +} /*extern "C"*/ +#endif + +#endif /*LV_BLEND_RISCV_V_PRIVATE_H*/ diff --git a/src/draw/sw/blend/riscv_v/lv_blend_riscv_vector_emulation.h b/src/draw/sw/blend/riscv_v/lv_blend_riscv_vector_emulation.h new file mode 100644 index 0000000000..3ed621f1ef --- /dev/null +++ b/src/draw/sw/blend/riscv_v/lv_blend_riscv_vector_emulation.h @@ -0,0 +1,921 @@ +/** + * @file lv_blend_riscv_vector_emulation.h + * Software emulation of RISC-V Vector Extension (RVV 1.0) intrinsics + * + * This header provides pure C implementations of RVV intrinsics to enable + * testing and verification on non-RVV platforms. The implementations follow + * the RVV specification as documented in: + * https://dzaima.github.io/intrinsics-viewer/ + * + * Usage: + * 1. On systems without RVV support, include this header BEFORE + * 2. Or define RISCV_VECTOR_EMULATION before including actual + * 3. 
All __riscv_* functions will be emulated in software + * + * Limitations: + * - No performance optimization (this is software emulation) + * - Vector length (vl) is tracked but all operations work on single elements in a loop + * - Predication and masking are simplified but functionally correct + * - LMUL < 1 (fractional multipliers) are not supported + */ + +#ifndef LV_BLEND_RISCV_VECTOR_EMULATION_H +#define LV_BLEND_RISCV_VECTOR_EMULATION_H + +#ifdef __cplusplus +extern "C" { +#endif +#include "../../../../lv_conf_internal.h" +#if LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_RISCV_V + +#include +#include +#include + +/* ============================================================================ + * Vector Type Definitions + * ============================================================================ + * + * For emulation, we use structs that hold data and the current vector length. + * Real RVV is much more sophisticated, but this allows us to verify logic. + * + * Assumption: VLEN = 128 bits (a common RVV configuration) + * - e8m1: 128 bits / 8 bits = 16 elements + * - e8m2: 256 bits / 8 bits = 32 elements + * - e8m4: 512 bits / 8 bits = 64 elements + * - e16m1: 128 bits / 16 bits = 8 elements + * - e16m2: 256 bits / 16 bits = 16 elements + * - e16m4: 512 bits / 16 bits = 32 elements + * - e32m1: 128 bits / 32 bits = 4 elements + * - e32m2: 256 bits / 32 bits = 8 elements + * - e32m4: 512 bits / 32 bits = 16 elements + */ + +/* LMUL = 1 (1 vector register, VLEN=128 bits) */ +typedef struct { + uint8_t data[16]; /* 128 bits / 8 bits per element = 16 elements */ + size_t vl; /* Current vector length */ +} vuint8m1_t; + +typedef struct { + uint16_t data[8]; /* 128 bits / 16 bits per element = 8 elements */ + size_t vl; +} vuint16m1_t; + +typedef struct { + uint32_t data[4]; /* 128 bits / 32 bits per element = 4 elements */ + size_t vl; +} vuint32m1_t; + +typedef struct { + uint64_t data[2]; /* 128 bits / 64 bits per element = 2 elements */ + size_t vl; +} vuint64m1_t; + +/* LMUL = 2 (2 vector registers, total 256 bits) */ +typedef struct { + uint8_t data[32]; /* 256 bits / 8 bits per element = 32 elements */ + size_t vl; +} vuint8m2_t; + +typedef struct { + uint16_t data[16]; /* 256 bits / 16 bits per element = 16 elements */ + size_t vl; +} vuint16m2_t; + +typedef struct { + uint32_t data[8]; /* 256 bits / 32 bits per element = 8 elements */ + size_t vl; +} vuint32m2_t; + +typedef struct { + uint64_t data[4]; /* 256 bits / 64 bits per element = 4 elements */ + size_t vl; +} vuint64m2_t; + +/* LMUL = 4 (4 vector registers, total 512 bits) */ +typedef struct { + uint32_t data[16]; /* 512 bits / 32 bits per element = 16 elements */ + size_t vl; +} vuint32m4_t; + +typedef struct { + uint16_t data[32]; /* 512 bits / 16 bits per element = 32 elements */ + size_t vl; +} vuint16m4_t; + +/* LMUL = 8 (8 vector registers, total 1024 bits) */ +typedef struct { + uint8_t data[128]; /* 1024 bits / 8 bits per element = 128 elements */ + size_t vl; +} vuint8m8_t; + +/* Boolean/mask types (vbool4 means SEW/LMUL=4, for e8m2 -> 8/2=4) */ +typedef struct { + uint8_t data[32]; /* Same size as the vector it masks (e8m2 = 32 elements) */ + size_t vl; +} vbool4_t; + +typedef struct { + uint8_t data[16]; /* Mask for e8m1 (16 elements) */ + size_t vl; +} vbool8_t; + +/* ============================================================================ + * Vector Length Management + * ============================================================================ + * + * Operations: + * - __riscv_vsetvl_* : Set vector length for 
given element type and LMUL + * - __riscv_vsetvlmax_* : Get maximum vector length + */ + +/** + * Get maximum vector length for given element type and LMUL + * Based on VLEN=128 bits + */ +static inline size_t __riscv_vsetvlmax_e8m1(void) +{ + return 16; /* 128/8 = 16 */ +} +static inline size_t __riscv_vsetvlmax_e8m2(void) +{ + return 32; /* 256/8 = 32 */ +} +static inline size_t __riscv_vsetvlmax_e8m4(void) +{ + return 64; /* 512/8 = 64 */ +} + +static inline size_t __riscv_vsetvlmax_e16m1(void) +{ + return 8; /* 128/16 = 8 */ +} +static inline size_t __riscv_vsetvlmax_e16m2(void) +{ + return 16; /* 256/16 = 16 */ +} +static inline size_t __riscv_vsetvlmax_e16m4(void) +{ + return 32; /* 512/16 = 32 */ +} + +static inline size_t __riscv_vsetvlmax_e32m1(void) +{ + return 4; /* 128/32 = 4 */ +} +static inline size_t __riscv_vsetvlmax_e32m2(void) +{ + return 8; /* 256/32 = 8 */ +} +static inline size_t __riscv_vsetvlmax_e32m4(void) +{ + return 16; /* 512/32 = 16 */ +} + +static inline size_t __riscv_vsetvlmax_e8m8(void) +{ + return 128; /* 1024/8 = 128 */ +} + +static inline size_t __riscv_vsetvlmax_e64m1(void) +{ + return 2; /* 128/64 = 2 */ +} +static inline size_t __riscv_vsetvlmax_e64m2(void) +{ + return 4; /* 256/64 = 4 */ +} +static inline size_t __riscv_vsetvlmax_e64m4(void) +{ + return 8; /* 512/64 = 8 */ +} + +/** + * Set vector length to requested value (or max if requested > max) + */ +static inline size_t __riscv_vsetvl_e8m1(size_t avl) +{ + return avl > 16 ? 16 : avl; +} + +static inline size_t __riscv_vsetvl_e8m2(size_t avl) +{ + return avl > 32 ? 32 : avl; +} + +static inline size_t __riscv_vsetvl_e8m4(size_t avl) +{ + return avl > 64 ? 64 : avl; +} + +static inline size_t __riscv_vsetvl_e16m1(size_t avl) +{ + return avl > 8 ? 8 : avl; +} + +static inline size_t __riscv_vsetvl_e16m2(size_t avl) +{ + return avl > 16 ? 16 : avl; +} + +static inline size_t __riscv_vsetvl_e16m4(size_t avl) +{ + return avl > 32 ? 32 : avl; +} + +static inline size_t __riscv_vsetvl_e32m1(size_t avl) +{ + return avl > 4 ? 4 : avl; +} + +static inline size_t __riscv_vsetvl_e32m2(size_t avl) +{ + return avl > 8 ? 8 : avl; +} + +static inline size_t __riscv_vsetvl_e32m4(size_t avl) +{ + return avl > 16 ? 16 : avl; +} + +static inline size_t __riscv_vsetvl_e8m8(size_t avl) +{ + return avl > 128 ? 128 : avl; +} + +/* ============================================================================ + * Vector Initialize Operations (vmv.v.x - broadcast) + * ============================================================================ + */ + +/** + * vmv.v.x: Broadcast scalar to all vector elements + */ +static inline vuint8m1_t __riscv_vmv_v_x_u8m1(uint8_t src, size_t vl) +{ + vuint8m1_t res; + res.vl = vl > 16 ? 16 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = src; + } + return res; +} + +static inline vuint8m2_t __riscv_vmv_v_x_u8m2(uint8_t src, size_t vl) +{ + vuint8m2_t res; + res.vl = vl > 32 ? 32 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = src; + } + return res; +} + +static inline vuint16m2_t __riscv_vmv_v_x_u16m2(uint16_t src, size_t vl) +{ + vuint16m2_t res; + res.vl = vl > 16 ? 16 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = src; + } + return res; +} + +static inline vuint16m4_t __riscv_vmv_v_x_u16m4(uint16_t src, size_t vl) +{ + vuint16m4_t res; + res.vl = vl > 32 ? 
32 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = src; + } + return res; +} + +static inline vuint32m4_t __riscv_vmv_v_x_u32m4(uint32_t src, size_t vl) +{ + vuint32m4_t res; + res.vl = vl > 16 ? 16 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = src; + } + return res; +} + +/* ============================================================================ + * Vector Load Operations (vle, vlse) + * ============================================================================ + */ + +/** + * vle8: Load vector of 8-bit elements with unit stride + */ +static inline vuint8m1_t __riscv_vle8_v_u8m1(const uint8_t * base, size_t vl) +{ + vuint8m1_t res; + res.vl = vl > 16 ? 16 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = base[i]; + } + return res; +} + +static inline vuint8m2_t __riscv_vle8_v_u8m2(const uint8_t * base, size_t vl) +{ + vuint8m2_t res; + res.vl = vl > 32 ? 32 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = base[i]; + } + return res; +} + +static inline vuint8m8_t __riscv_vle8_v_u8m8(const uint8_t * base, size_t vl) +{ + vuint8m8_t res; + res.vl = vl > 128 ? 128 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = base[i]; + } + return res; +} + +/** + * vlse8: Load vector with stride + * Load from address base + i * stride for each element i + */ +static inline vuint8m2_t __riscv_vlse8_v_u8m2(const uint8_t * base, ptrdiff_t stride, size_t vl) +{ + vuint8m2_t res; + res.vl = vl > 32 ? 32 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = *(const uint8_t *)((const char *)base + i * stride); + } + return res; +} + +static inline vuint8m1_t __riscv_vlse8_v_u8m1(const uint8_t * base, ptrdiff_t stride, size_t vl) +{ + vuint8m1_t res; + res.vl = vl > 16 ? 16 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = *(const uint8_t *)((const char *)base + i * stride); + } + return res; +} + +/** + * vle16: Load 16-bit vector + */ +static inline vuint16m2_t __riscv_vle16_v_u16m2(const uint16_t * base, size_t vl) +{ + vuint16m2_t res; + res.vl = vl > 16 ? 16 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = base[i]; + } + return res; +} + +/** + * vlse16: Load 16-bit vector with stride + */ +static inline vuint16m2_t __riscv_vlse16_v_u16m2(const uint16_t * base, ptrdiff_t stride, size_t vl) +{ + vuint16m2_t res; + res.vl = vl > 16 ? 
16 : vl; + for(size_t i = 0; i < res.vl; i++) { + res.data[i] = *(const uint16_t *)((const char *)base + i * stride); + } + return res; +} + +/* ============================================================================ + * Vector Store Operations (vse, vsse) + * ============================================================================ + */ + +/** + * vse8: Store vector of 8-bit elements with unit stride + */ +static inline void __riscv_vse8_v_u8m1(uint8_t * base, vuint8m1_t v, size_t vl) +{ + for(size_t i = 0; i < v.vl && i < vl; i++) { + base[i] = v.data[i]; + } +} + +static inline void __riscv_vse8_v_u8m2(uint8_t * base, vuint8m2_t v, size_t vl) +{ + for(size_t i = 0; i < v.vl && i < vl; i++) { + base[i] = v.data[i]; + } +} + +static inline void __riscv_vse8_v_u8m8(uint8_t * base, vuint8m8_t v, size_t vl) +{ + for(size_t i = 0; i < v.vl && i < vl; i++) { + base[i] = v.data[i]; + } +} + +/** + * vsse8: Store vector with stride + * Store to address base + i * stride for each element i + */ +static inline void __riscv_vsse8_v_u8m2(uint8_t * base, ptrdiff_t stride, vuint8m2_t v, size_t vl) +{ + for(size_t i = 0; i < v.vl && i < vl; i++) { + *(uint8_t *)((char *)base + i * stride) = v.data[i]; + } +} + +static inline void __riscv_vsse8_v_u8m1(uint8_t * base, ptrdiff_t stride, vuint8m1_t v, size_t vl) +{ + for(size_t i = 0; i < v.vl && i < vl; i++) { + *(uint8_t *)((char *)base + i * stride) = v.data[i]; + } +} + +/** + * vse16: Store 16-bit vector + */ +static inline void __riscv_vse16_v_u16m2(uint16_t * base, vuint16m2_t v, size_t vl) +{ + for(size_t i = 0; i < v.vl && i < vl; i++) { + base[i] = v.data[i]; + } +} + +static inline void __riscv_vse16_v_u16m4(uint16_t * base, vuint16m4_t v, size_t vl) +{ + for(size_t i = 0; i < v.vl && i < vl; i++) { + base[i] = v.data[i]; + } +} + +/** + * vsse16: Store 16-bit vector with stride + */ +static inline void __riscv_vsse16_v_u16m2(uint16_t * base, ptrdiff_t stride, vuint16m2_t v, size_t vl) +{ + for(size_t i = 0; i < v.vl && i < vl; i++) { + *(uint16_t *)((char *)base + i * stride) = v.data[i]; + } +} + +/** + * vse32: Store 32-bit vector + */ +static inline void __riscv_vse32_v_u32m4(uint32_t * base, vuint32m4_t v, size_t vl) +{ + for(size_t i = 0; i < v.vl && i < vl; i++) { + base[i] = v.data[i]; + } +} + +/* ============================================================================ + * Vector Arithmetic Operations + * ============================================================================ + */ + +/** + * vmul: Vector multiply (scalar * vector) + */ +static inline vuint16m2_t __riscv_vmul_vx_u16m2(vuint16m2_t v, uint16_t x, size_t vl) +{ + vuint16m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = v.data[i] * x; + } + return res; +} + +static inline vuint16m2_t __riscv_vmul_vv_u16m2(vuint16m2_t v1, vuint16m2_t v2, size_t vl) +{ + vuint16m2_t res; + res.vl = v1.vl; + for(size_t i = 0; i < v1.vl && i < vl; i++) { + res.data[i] = v1.data[i] * v2.data[i]; + } + return res; +} + +/** + * vwmulu: Vector widening multiply unsigned (scalar * vector, 8-bit -> 16-bit) + */ +static inline vuint16m4_t __riscv_vwmulu_vx_u16m4(vuint8m2_t v, uint8_t x, size_t vl) +{ + vuint16m4_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = (uint16_t)v.data[i] * (uint16_t)x; + } + return res; +} + +static inline vuint16m2_t __riscv_vwmulu_vx_u16m2(vuint8m1_t v, uint8_t x, size_t vl) +{ + vuint16m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = 
(uint16_t)v.data[i] * (uint16_t)x; + } + return res; +} + +static inline vuint16m2_t __riscv_vwmulu_vv_u16m2(vuint8m1_t v1, vuint8m1_t v2, size_t vl) +{ + vuint16m2_t res; + res.vl = v1.vl; + for(size_t i = 0; i < v1.vl && i < vl; i++) { + res.data[i] = (uint16_t)v1.data[i] * (uint16_t)v2.data[i]; + } + return res; +} + +static inline vuint16m4_t __riscv_vwmulu_vv_u16m4(vuint8m2_t v1, vuint8m2_t v2, size_t vl) +{ + vuint16m4_t res; + res.vl = v1.vl; + for(size_t i = 0; i < v1.vl && i < vl; i++) { + res.data[i] = (uint16_t)v1.data[i] * (uint16_t)v2.data[i]; + } + return res; +} + +static inline vuint16m2_t __riscv_vwmaccu_vx_u16m2(vuint16m2_t acc, uint8_t x, vuint8m1_t v, size_t vl) +{ + vuint16m2_t res = acc; + for(size_t i = 0; i < acc.vl && i < vl; i++) { + res.data[i] += (uint16_t)x * (uint16_t)v.data[i]; + } + return res; +} + +static inline vuint16m4_t __riscv_vwmaccu_vx_u16m4(vuint16m4_t acc, uint8_t x, vuint8m2_t v, size_t vl) +{ + vuint16m4_t res = acc; + for(size_t i = 0; i < acc.vl && i < vl; i++) { + res.data[i] += (uint16_t)x * (uint16_t)v.data[i]; + } + return res; +} + +static inline vuint16m2_t __riscv_vwmaccu_vv_u16m2(vuint16m2_t acc, vuint8m1_t v1, vuint8m1_t v2, size_t vl) +{ + vuint16m2_t res = acc; + for(size_t i = 0; i < acc.vl && i < vl; i++) { + res.data[i] += (uint16_t)v1.data[i] * (uint16_t)v2.data[i]; + } + return res; +} + +static inline vuint16m4_t __riscv_vwmaccu_vv_u16m4(vuint16m4_t acc, vuint8m2_t v1, vuint8m2_t v2, size_t vl) +{ + vuint16m4_t res = acc; + for(size_t i = 0; i < acc.vl && i < vl; i++) { + res.data[i] += (uint16_t)v1.data[i] * (uint16_t)v2.data[i]; + } + return res; +} + +/* ============================================================================ + * Vector Shift Operations + * ============================================================================ + */ + +/** + * vsrl: Vector shift right logical (scalar shift amount) + */ +static inline vuint16m2_t __riscv_vsrl_vx_u16m2(vuint16m2_t v, uint32_t x, size_t vl) +{ + vuint16m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = v.data[i] >> x; + } + return res; +} + +static inline vuint16m4_t __riscv_vsrl_vx_u16m4(vuint16m4_t v, uint32_t x, size_t vl) +{ + vuint16m4_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = v.data[i] >> x; + } + return res; +} + +static inline vuint8m1_t __riscv_vnsrl_wx_u8m1(vuint16m2_t v, uint32_t x, size_t vl) +{ + vuint8m1_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = (uint8_t)(v.data[i] >> x); + } + return res; +} + +/** + * vnsrl: Vector narrow shift right logical + * Narrow from 16-bit to 8-bit with shift + */ +static inline vuint8m2_t __riscv_vnsrl_wx_u8m2(vuint16m4_t v, uint32_t x, size_t vl) +{ + vuint8m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = (uint8_t)(v.data[i] >> x); + } + return res; +} + +/* ============================================================================ + * Vector Bitwise Operations + * ============================================================================ + */ + +/** + * vand: Vector bitwise AND (scalar) + */ +static inline vuint16m2_t __riscv_vand_vx_u16m2(vuint16m2_t v, uint16_t x, size_t vl) +{ + vuint16m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = v.data[i] & x; + } + return res; +} + +/** + * vor: Vector bitwise OR (vector) + */ +static inline vuint16m2_t __riscv_vor_vv_u16m2(vuint16m2_t v1, vuint16m2_t v2, size_t vl) +{ + 
vuint16m2_t res; + res.vl = v1.vl; + for(size_t i = 0; i < v1.vl && i < vl; i++) { + res.data[i] = v1.data[i] | v2.data[i]; + } + return res; +} + +/* ============================================================================ + * Vector Shift Left Operations + * ============================================================================ + */ + +/** + * vsll: Vector shift left logical (scalar shift amount) + */ +static inline vuint16m2_t __riscv_vsll_vx_u16m2(vuint16m2_t v, uint32_t x, size_t vl) +{ + vuint16m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = v.data[i] << x; + } + return res; +} + +/* ============================================================================ + * Vector Comparison Operations + * ============================================================================ + */ + +static inline vbool8_t __riscv_vmseq_vx_u8m1_b8(vuint8m1_t v, uint8_t x, size_t vl) +{ + vbool8_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = (v.data[i] == x) ? 1 : 0; + } + return res; +} + +/** + * vmseq: Vector equal comparison (scalar) + * Returns boolean mask (1 if equal, 0 if not) + */ +static inline vbool4_t __riscv_vmseq_vx_u8m2_b4(vuint8m2_t v, uint8_t x, size_t vl) +{ + vbool4_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = (v.data[i] == x) ? 1 : 0; + } + return res; +} + +/** + * vmsgeu: Vector greater or equal comparison (scalar) + * Returns boolean mask + */ +static inline vbool8_t __riscv_vmsgeu_vx_u8m1_b8(vuint8m1_t v, uint8_t x, size_t vl) +{ + vbool8_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = (v.data[i] >= x) ? 1 : 0; + } + return res; +} + +static inline vbool4_t __riscv_vmsgeu_vx_u8m2_b4(vuint8m2_t v, uint8_t x, size_t vl) +{ + vbool4_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = (v.data[i] >= x) ? 1 : 0; + } + return res; +} + +/* ============================================================================ + * Vector Merge Operations (Conditional) + * ============================================================================ + */ + +/** + * vmerge: Merge vector under predicate mask (vector paths) + * Result = mask ? v2 : v1 + */ +static inline vuint8m1_t __riscv_vmerge_vvm_u8m1(vuint8m1_t v1, vuint8m1_t v2, vbool8_t mask, size_t vl) +{ + vuint8m1_t res; + res.vl = v1.vl; + for(size_t i = 0; i < v1.vl && i < vl; i++) { + res.data[i] = mask.data[i] ? v2.data[i] : v1.data[i]; + } + return res; +} + +static inline vuint8m2_t __riscv_vmerge_vvm_u8m2(vuint8m2_t v1, vuint8m2_t v2, vbool4_t mask, size_t vl) +{ + vuint8m2_t res; + res.vl = v1.vl; + for(size_t i = 0; i < v1.vl && i < vl; i++) { + res.data[i] = mask.data[i] ? v2.data[i] : v1.data[i]; + } + return res; +} + +/** + * vmerge: Merge scalar under predicate mask (scalar path) + * Result = mask ? scalar : vector + */ +static inline vuint8m1_t __riscv_vmerge_vxm_u8m1(vuint8m1_t v, uint8_t x, vbool8_t mask, size_t vl) +{ + vuint8m1_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = mask.data[i] ? x : v.data[i]; + } + return res; +} + +static inline vuint8m2_t __riscv_vmerge_vxm_u8m2(vuint8m2_t v, uint8_t x, vbool4_t mask, size_t vl) +{ + vuint8m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = mask.data[i] ? 
x : v.data[i]; + } + return res; +} + +/* ========================================================================== + * Vector Reverse Subtract Operations + * ========================================================================= */ + +static inline vuint8m1_t __riscv_vrsub_vx_u8m1(vuint8m1_t v, uint8_t x, size_t vl) +{ + vuint8m1_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = x - v.data[i]; + } + return res; +} + +static inline vuint8m2_t __riscv_vrsub_vx_u8m2(vuint8m2_t v, uint8_t x, size_t vl) +{ + vuint8m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = x - v.data[i]; + } + return res; +} + +static inline vuint16m2_t __riscv_vrsub_vx_u16m2(vuint16m2_t v, uint16_t x, size_t vl) +{ + vuint16m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = x - v.data[i]; + } + return res; +} + +/* ========================================================================== + * Vector Add Operations + * ========================================================================= */ + +static inline vuint8m1_t __riscv_vadd_vv_u8m1(vuint8m1_t v1, vuint8m1_t v2, size_t vl) +{ + vuint8m1_t res; + res.vl = v1.vl; + for(size_t i = 0; i < v1.vl && i < vl; i++) { + res.data[i] = v1.data[i] + v2.data[i]; + } + return res; +} + +static inline vuint16m4_t __riscv_vadd_vx_u16m4(vuint16m4_t v, uint16_t x, size_t vl) +{ + vuint16m4_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = v.data[i] + x; + } + return res; +} + +static inline vuint16m2_t __riscv_vadd_vx_u16m2(vuint16m2_t v, uint16_t x, size_t vl) +{ + vuint16m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = v.data[i] + x; + } + return res; +} + +/* ========================================================================== + * Vector Zero-Extend Operations + * ========================================================================= */ + +static inline vuint16m2_t __riscv_vzext_vf2_u16m2(vuint8m1_t v, size_t vl) +{ + vuint16m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = (uint16_t)v.data[i]; + } + return res; +} + +/* ========================================================================== + * Widening multiply/accumulate for 16-bit -> 32-bit (m2 -> m4) + * ========================================================================= */ +static inline vuint32m4_t __riscv_vwmulu_vx_u32m4(vuint16m2_t v, uint32_t x, size_t vl) +{ + vuint32m4_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = (uint32_t)v.data[i] * x; + } + return res; +} + +static inline vuint32m4_t __riscv_vwmulu_vv_u32m4(vuint16m2_t v1, vuint16m2_t v2, size_t vl) +{ + vuint32m4_t res; + res.vl = v1.vl; + for(size_t i = 0; i < v1.vl && i < vl; i++) { + res.data[i] = (uint32_t)v1.data[i] * (uint32_t)v2.data[i]; + } + return res; +} + +static inline vuint32m4_t __riscv_vwmaccu_vx_u32m4(vuint32m4_t acc, uint32_t x, vuint16m2_t v, size_t vl) +{ + vuint32m4_t res = acc; + for(size_t i = 0; i < acc.vl && i < vl; i++) { + res.data[i] += (uint32_t)x * (uint32_t)v.data[i]; + } + return res; +} + +static inline vuint16m2_t __riscv_vnsrl_wx_u16m2(vuint32m4_t v, uint32_t x, size_t vl) +{ + vuint16m2_t res; + res.vl = v.vl; + for(size_t i = 0; i < v.vl && i < vl; i++) { + res.data[i] = (uint16_t)(v.data[i] >> x); + } + return res; +} + +#endif /* LV_USE_DRAW_SW_ASM_RISCV_V */ + +#ifdef __cplusplus +} +#endif + +#endif /* 
LV_BLEND_RISCV_VECTOR_EMULATION_H */ diff --git a/src/draw/sw/blend/riscv_v/lv_draw_sw_blend_riscv_v_to_rgb888.c b/src/draw/sw/blend/riscv_v/lv_draw_sw_blend_riscv_v_to_rgb888.c new file mode 100644 index 0000000000..37e3b4e0ad --- /dev/null +++ b/src/draw/sw/blend/riscv_v/lv_draw_sw_blend_riscv_v_to_rgb888.c @@ -0,0 +1,1643 @@ +/** + * @file lv_draw_sw_blend_riscv_v_to_rgb888.c + * RGB888/XRGB8888 blend implementation for RISC-V Vector Extension (RVV 1.0) + * + * Supports both dest_px_size=3 (RGB888) and dest_px_size=4 (XRGB8888) + * Reference: lv_draw_sw_blend_neon_to_rgb888.c + * + * NOTE: All RVV blend logic is inlined to avoid passing vuint32m4_t as function + * parameters, which causes complex stack operations that can corrupt the stack. + */ + +/********************* + * INCLUDES + *********************/ +#include "lv_draw_sw_blend_riscv_v_to_rgb888.h" +#if LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_RISCV_V + +#include "../../../../misc/lv_color.h" +#include "../../../../misc/lv_types.h" +#include "../lv_draw_sw_blend_private.h" +#include "lv_blend_riscv_v_private.h" + +/********************* + * DEFINES + *********************/ + +/********************** + * TYPEDEFS + **********************/ + +/********************** + * STATIC PROTOTYPES + **********************/ + +/********************** + * GLOBAL FUNCTIONS + **********************/ + +/** + * Fill with solid color (no blending needed, opa >= 255) + */ +lv_result_t lv_draw_sw_blend_riscv_v_color_to_rgb888(lv_draw_sw_blend_fill_dsc_t * dsc, uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa >= LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf == NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + uint8_t * dest_buf = dsc->dest_buf; + size_t vl; + + if(dest_px_size == 3) { + /* RGB888: 3 bytes per pixel (B, G, R) - use RVV segmented store */ + /* Initialize color vectors once with max vl */ + size_t vlmax = __riscv_vsetvlmax_e8m2(); + vuint8m2_t v_b = __riscv_vmv_v_x_u8m2(dsc->color.blue, vlmax); + vuint8m2_t v_g = __riscv_vmv_v_x_u8m2(dsc->color.green, vlmax); + vuint8m2_t v_r = __riscv_vmv_v_x_u8m2(dsc->color.red, vlmax); + + for(int32_t y = 0; y < h; y++) { + /* Process with RVV using segmented store for 3-byte pixels */ + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m2(w - x); + LV_RVV_VSSEG3E8_U8M2(dest_buf + x * 3, v_b, v_g, v_r, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + } + } + else { /* dest_px_size == 4 */ + /* XRGB8888: 4 bytes per pixel */ + const uint32_t color32 = 0xFF000000 | ((uint32_t)dsc->color.red << 16) | + ((uint32_t)dsc->color.green << 8) | dsc->color.blue; + + /* Initialize color vector once with max vl */ + size_t vlmax = __riscv_vsetvlmax_e32m4(); + vuint32m4_t v_color = __riscv_vmv_v_x_u32m4(color32, vlmax); + + for(int32_t y = 0; y < h; y++) { + /* Process with RVV - use m4 to reduce register pressure */ + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e32m4(w - x); + __riscv_vse32_v_u32m4((uint32_t *)(dest_buf + x * 4), v_color, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + } + } + + return LV_RESULT_OK; +} + +/** + * Fill with color and opacity (opa < 255) + * blend formula: result = (fg * opa + bg * (255 - opa)) >> 8 + */ +lv_result_t lv_draw_sw_blend_riscv_v_color_to_rgb888_with_opa(lv_draw_sw_blend_fill_dsc_t * dsc, uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa < 
LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf == NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const uint8_t opa = dsc->opa; + const uint8_t opa_inv = 255 - opa; + const uint16_t fg_b_opa = (uint16_t)dsc->color.blue * opa; + const uint16_t fg_g_opa = (uint16_t)dsc->color.green * opa; + const uint16_t fg_r_opa = (uint16_t)dsc->color.red * opa; + uint8_t * dest_buf = dsc->dest_buf; + size_t vl; + + /* Early exit if fully transparent */ + if(opa == 0) return LV_RESULT_OK; + + if(dest_px_size == 3) { + for(int32_t y = 0; y < h; y++) { + /* Process with RVV using segmented load/store for 3-byte pixels */ + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m2(w - x); + + /* Load destination B, G, R channels using segmented load */ + vuint8m2_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_VLSEG3E8_U8M2(dest_buf + x * 3, vl, v_dst_b, v_dst_g, v_dst_r); + + /* Blend solid color with destination */ + vuint8m2_t v_b, v_g, v_r; + LV_RVV_BLEND_SOLID_RGB_U8M2(v_dst_r, v_dst_g, v_dst_b, + fg_r_opa, fg_g_opa, fg_b_opa, opa_inv, + v_r, v_g, v_b, vl); + + /* Store result using segmented store */ + LV_RVV_VSSEG3E8_U8M2(dest_buf + x * 3, v_b, v_g, v_r, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + } + } + else { /* dest_px_size == 4 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m2(w - x); + + /* Load destination B, G, R, X channels using segmented load */ + vuint8m2_t v_dst_b, v_dst_g, v_dst_r, v_dst_x; + LV_RVV_VLSEG4E8_U8M2(dest_buf + x * 4, vl, v_dst_b, v_dst_g, v_dst_r, v_dst_x); + /* v_dst_x is X/Alpha, ignored for input */ + (void)v_dst_x; + + /* Blend solid color with destination */ + vuint8m2_t v_b, v_g, v_r; + LV_RVV_BLEND_SOLID_RGB_U8M2(v_dst_r, v_dst_g, v_dst_b, + fg_r_opa, fg_g_opa, fg_b_opa, opa_inv, + v_r, v_g, v_b, vl); + + vuint8m2_t v_x = __riscv_vmv_v_x_u8m2(0xFF, vl); /* Alpha = 0xFF */ + + /* Store result using segmented store */ + LV_RVV_VSSEG4E8_U8M2(dest_buf + x * 4, v_b, v_g, v_r, v_x, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + } + } + + return LV_RESULT_OK; +} + +/** + * Fill with color and per-pixel mask (opa >= 255) + */ +lv_result_t lv_draw_sw_blend_riscv_v_color_to_rgb888_with_mask(lv_draw_sw_blend_fill_dsc_t * dsc, uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa >= LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf != NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t mask_stride = dsc->mask_stride; + const uint8_t * mask_buf = dsc->mask_buf; + const uint8_t fg_b = dsc->color.blue; + const uint8_t fg_g = dsc->color.green; + const uint8_t fg_r = dsc->color.red; + uint8_t * dest_buf = dsc->dest_buf; + size_t vl; + + if(dest_px_size == 3) { + /* RGB888: 3 bytes per pixel - use RVV for blending with mask */ + for(int32_t y = 0; y < h; y++) { + /* Process with RVV using segmented load/store for 3-byte pixels */ + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m2(w - x); + + /* Load mask values */ + vuint8m2_t v_mask8 = __riscv_vle8_v_u8m2(&mask_buf[x], vl); + + /* Load destination B, G, R channels using segmented load */ + vuint8m2_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_VLSEG3E8_U8M2(dest_buf + x * 3, vl, v_dst_b, v_dst_g, v_dst_r); + + /* Blend solid color with mask */ + vuint8m2_t v_b, v_g, v_r; + LV_RVV_BLEND_SOLID_RGB_VMASK_U8M2(v_dst_r, v_dst_g, v_dst_b, + fg_r, fg_g, 
fg_b, v_mask8, + v_r, v_g, v_b, vl); + + /* Optional: Handle special cases for mask == 0 or mask >= 255. + * Without this, max error is ±1 (e.g., (x*255)>>8 ≈ x*0.996). + * For graphics rendering, ±1 error is typically acceptable. + * Uncomment below if exact values are required. */ + LV_RVV_BLEND_OPTIMIZE_MASK_SCALAR_U8M2(v_r, v_g, v_b, + fg_r, fg_g, fg_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask8, vl); + /* Store result using segmented store */ + LV_RVV_VSSEG3E8_U8M2(dest_buf + x * 3, v_b, v_g, v_r, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + mask_buf += mask_stride; + } + } + else { /* dest_px_size == 4 */ + /* XRGB8888: 4 bytes per pixel - use segmented load/store like RGB888 */ + for(int32_t y = 0; y < h; y++) { + /* Process with RVV using segmented load/store for 4-byte pixels */ + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m2(w - x); + + /* Load mask values */ + vuint8m2_t v_mask8 = __riscv_vle8_v_u8m2(&mask_buf[x], vl); + + /* Load destination B, G, R, X channels using segmented load */ + vuint8m2_t v_dst_b, v_dst_g, v_dst_r, v_dst_x; + LV_RVV_VLSEG4E8_U8M2(dest_buf + x * 4, vl, v_dst_b, v_dst_g, v_dst_r, v_dst_x); + /* v_dst_x is X/Alpha, ignored for input */ + (void)v_dst_x; + + /* Blend solid color with mask */ + vuint8m2_t v_b, v_g, v_r; + LV_RVV_BLEND_SOLID_RGB_VMASK_U8M2(v_dst_r, v_dst_g, v_dst_b, + fg_r, fg_g, fg_b, v_mask8, + v_r, v_g, v_b, vl); + vuint8m2_t v_x = __riscv_vmv_v_x_u8m2(0xFF, vl); /* Alpha = 0xFF */ + + /* Optional: Handle special cases for mask == 0 or mask >= 255. + * Without this, max error is ±1. Uncomment if exact values required. */ + LV_RVV_BLEND_OPTIMIZE_MASK_SCALAR_U8M2(v_r, v_g, v_b, + fg_r, fg_g, fg_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask8, vl); + + /* Store result using segmented store */ + LV_RVV_VSSEG4E8_U8M2(dest_buf + x * 4, v_b, v_g, v_r, v_x, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + mask_buf += mask_stride; + } + } + + return LV_RESULT_OK; +} + +/** + * Fill with color, opacity, and per-pixel mask + * Effective mix = (mask * opa) >> 8 + */ +lv_result_t lv_draw_sw_blend_riscv_v_color_to_rgb888_with_opa_mask(lv_draw_sw_blend_fill_dsc_t * dsc, + uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa < LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf != NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const uint8_t opa = dsc->opa; + const int32_t mask_stride = dsc->mask_stride; + const uint8_t * mask_buf = dsc->mask_buf; + uint8_t * dest_buf = dsc->dest_buf; + const uint8_t fg_b = dsc->color.blue; + const uint8_t fg_g = dsc->color.green; + const uint8_t fg_r = dsc->color.red; + size_t vl; + + /* Early exit if fully transparent */ + if(opa == 0) return LV_RESULT_OK; + + if(dest_px_size == 3) { + /* RGB888: 3 bytes per pixel - use RVV for blending with opa and mask */ + + for(int32_t y = 0; y < h; y++) { + /* Process with RVV using segmented load/store for 3-byte pixels */ + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m2(w - x); + + /* Load mask values */ + vuint8m2_t v_mask8 = __riscv_vle8_v_u8m2(&mask_buf[x], vl); + + /* Load destination B, G, R channels using segmented load */ + vuint8m2_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_VLSEG3E8_U8M2(dest_buf + x * 3, vl, v_dst_b, v_dst_g, v_dst_r); + + /* Compute mix = (mask * opa) >> 8 using widening multiply then narrow */ + vuint16m4_t v_mix16 = __riscv_vwmulu_vx_u16m4(v_mask8, opa, vl); + vuint8m2_t v_mix8 = 
__riscv_vnsrl_wx_u8m2(v_mix16, 8, vl); + + /* Blend solid color with mix (mask * opa) */ + vuint8m2_t v_b, v_g, v_r; + LV_RVV_BLEND_SOLID_RGB_VMASK_U8M2(v_dst_r, v_dst_g, v_dst_b, + fg_r, fg_g, fg_b, v_mix8, + v_r, v_g, v_b, vl); + + /* Optional: Handle special cases for mix == 0 or mix >= 255. + * Without this, max error is ±1. Uncomment if exact values required. */ + + LV_RVV_BLEND_OPTIMIZE_MASK_SCALAR_U8M2(v_r, v_g, v_b, + fg_r, fg_g, fg_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix8, vl); + + /* Store result using segmented store */ + LV_RVV_VSSEG3E8_U8M2(dest_buf + x * 3, v_b, v_g, v_r, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + mask_buf += mask_stride; + } + } + else { /* dest_px_size == 4 */ + /* XRGB8888: 4 bytes per pixel - use segmented load/store like RGB888 */ + + + for(int32_t y = 0; y < h; y++) { + /* Process with RVV using segmented load/store for 4-byte pixels */ + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m2(w - x); + + /* Load mask values */ + vuint8m2_t v_mask8 = __riscv_vle8_v_u8m2(&mask_buf[x], vl); + + /* Compute mix = (mask * opa) >> 8 using widening multiply */ + vuint16m4_t v_mix16 = __riscv_vsrl_vx_u16m4( + __riscv_vwmulu_vx_u16m4(v_mask8, opa, vl), 8, vl); + vuint8m2_t v_mix8 = __riscv_vnsrl_wx_u8m2(v_mix16, 0, vl); + + /* Load destination B, G, R, X channels using segmented load */ + vuint8m2_t v_dst_b, v_dst_g, v_dst_r, v_dst_x; + LV_RVV_VLSEG4E8_U8M2(dest_buf + x * 4, vl, v_dst_b, v_dst_g, v_dst_r, v_dst_x); + (void)v_dst_x; /* v_dst_x is X/Alpha, ignored for input */ + + /* Blend solid color with mix (mask * opa) */ + vuint8m2_t v_b, v_g, v_r; + LV_RVV_BLEND_SOLID_RGB_VMASK_U8M2(v_dst_r, v_dst_g, v_dst_b, + fg_r, fg_g, fg_b, v_mix8, + v_r, v_g, v_b, vl); + + LV_RVV_BLEND_OPTIMIZE_MASK_SCALAR_U8M2(v_r, v_g, v_b, + fg_r, fg_g, fg_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix8, vl); + + vuint8m2_t v_x = __riscv_vmv_v_x_u8m2(0xFF, vl); /* Alpha = 0xFF */ + + /* Store result using segmented store */ + LV_RVV_VSSEG4E8_U8M2(dest_buf + x * 4, v_b, v_g, v_r, v_x, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + mask_buf += mask_stride; + } + } + + return LV_RESULT_OK; +} + +/********************** + * RGB565 TO RGB888 BLEND FUNCTIONS + **********************/ + +/** + * RGB565 to RGB888/XRGB8888 simple copy (no blending, opa >= 255) + * RGB565 format: RRRRRGGGGGGBBBBB (5-6-5 bits) + */ +lv_result_t lv_draw_sw_blend_riscv_v_rgb565_to_rgb888(lv_draw_sw_blend_image_dsc_t * dsc, uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa >= LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf == NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + uint8_t * dest_buf = dsc->dest_buf; + const uint16_t * src_buf = dsc->src_buf; + size_t vl; + + if(dest_px_size == 3) { + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e16m2(w - x); + + /* Load RGB565 pixels */ + vuint16m2_t v_rgb565 = __riscv_vle16_v_u16m2(&src_buf[x], vl); + + /* Extract R5, G6, B5 components */ + vuint16m2_t v_r5 = __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v_rgb565, 11, vl), 0x1F, vl); + vuint16m2_t v_g6 = __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v_rgb565, 5, vl), 0x3F, vl); + vuint16m2_t v_b5 = __riscv_vand_vx_u16m2(v_rgb565, 0x1F, vl); + + /* Convert to 8-bit: R8 = (R5 * 2106) >> 8, G8 = (G6 * 1037) >> 8, B8 = (B5 * 2106) >> 8 */ + vuint16m2_t v_r8_16 = 
__riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_r5, 2106, vl), 8, vl); + vuint16m2_t v_g8_16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_g6, 1037, vl), 8, vl); + vuint16m2_t v_b8_16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_b5, 2106, vl), 8, vl); + + /* Narrow to 8-bit */ + vuint8m1_t v_r = __riscv_vnsrl_wx_u8m1(v_r8_16, 0, vl); + vuint8m1_t v_g = __riscv_vnsrl_wx_u8m1(v_g8_16, 0, vl); + vuint8m1_t v_b = __riscv_vnsrl_wx_u8m1(v_b8_16, 0, vl); + + /* Store using stride store for RGB888 (3 bytes per pixel) */ + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + else { /* dest_px_size == 4 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e16m2(w - x); + + /* Load RGB565 pixels */ + vuint16m2_t v_rgb565 = __riscv_vle16_v_u16m2(&src_buf[x], vl); + + /* Extract R5, G6, B5 components */ + vuint16m2_t v_r5 = __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v_rgb565, 11, vl), 0x1F, vl); + vuint16m2_t v_g6 = __riscv_vand_vx_u16m2(__riscv_vsrl_vx_u16m2(v_rgb565, 5, vl), 0x3F, vl); + vuint16m2_t v_b5 = __riscv_vand_vx_u16m2(v_rgb565, 0x1F, vl); + + /* Convert to 8-bit */ + vuint16m2_t v_r8_16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_r5, 2106, vl), 8, vl); + vuint16m2_t v_g8_16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_g6, 1037, vl), 8, vl); + vuint16m2_t v_b8_16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_b5, 2106, vl), 8, vl); + + /* Narrow to 8-bit */ + vuint8m1_t v_r = __riscv_vnsrl_wx_u8m1(v_r8_16, 0, vl); + vuint8m1_t v_g = __riscv_vnsrl_wx_u8m1(v_g8_16, 0, vl); + vuint8m1_t v_b = __riscv_vnsrl_wx_u8m1(v_b8_16, 0, vl); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, vl); + + /* Store using stride store for XRGB8888 (4 bytes per pixel) */ + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + + return LV_RESULT_OK; +} + +/** + * RGB565 to RGB888/XRGB8888 with opacity + * blend formula: result = (src * opa + dst * (255 - opa)) >> 8 + * Optimized using vwmaccu for blend calculation + */ +lv_result_t lv_draw_sw_blend_riscv_v_rgb565_to_rgb888_with_opa(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa < LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf == NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + const uint8_t opa = dsc->opa; + uint8_t * dest_buf = dsc->dest_buf; + const uint16_t * src_buf = dsc->src_buf; + size_t vl; + + if(dest_px_size == 3) { + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + + /* Load RGB565 source pixels */ + vuint16m2_t v_rgb565 = __riscv_vle16_v_u16m2(&src_buf[x], vl); + + /* Convert RGB565 to RGB888 using macro, then narrow to 8-bit */ + vuint16m2_t v_src_r16, v_src_g16, v_src_b16; + LV_RVV_RGB565_TO_RGB888_U16M2(v_rgb565, v_src_r16, v_src_g16, v_src_b16, vl); + vuint8m1_t v_src_r = __riscv_vnsrl_wx_u8m1(v_src_r16, 0, vl); + vuint8m1_t v_src_g = __riscv_vnsrl_wx_u8m1(v_src_g16, 0, vl); + vuint8m1_t v_src_b = __riscv_vnsrl_wx_u8m1(v_src_b16, 0, vl); + + /* Load destination RGB888 using stride load */ + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, 
v_dst_r, vl); + + /* Blend using vwmaccu */ + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + opa, v_r, v_g, v_b, vl); + + /* Store result */ + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + else { /* dest_px_size == 4 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + + /* Load RGB565 source pixels */ + vuint16m2_t v_rgb565 = __riscv_vle16_v_u16m2(&src_buf[x], vl); + + /* Convert RGB565 to RGB888 using macro, then narrow to 8-bit */ + vuint16m2_t v_src_r16, v_src_g16, v_src_b16; + LV_RVV_RGB565_TO_RGB888_U16M2(v_rgb565, v_src_r16, v_src_g16, v_src_b16, vl); + vuint8m1_t v_src_r = __riscv_vnsrl_wx_u8m1(v_src_r16, 0, vl); + vuint8m1_t v_src_g = __riscv_vnsrl_wx_u8m1(v_src_g16, 0, vl); + vuint8m1_t v_src_b = __riscv_vnsrl_wx_u8m1(v_src_b16, 0, vl); + + /* Load destination XRGB8888 using stride load */ + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + + /* Blend using vwmaccu */ + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + opa, v_r, v_g, v_b, vl); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, vl); + + /* Store result */ + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + + return LV_RESULT_OK; +} + +/** + * RGB565 to RGB888/XRGB8888 with per-pixel mask + * blend formula: result = (src * mask + dst * (255 - mask)) >> 8 + * Optimized using vwmaccu for blend calculation + */ +lv_result_t lv_draw_sw_blend_riscv_v_rgb565_to_rgb888_with_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa >= LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf != NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + const int32_t mask_stride = dsc->mask_stride; + uint8_t * dest_buf = dsc->dest_buf; + const uint16_t * src_buf = dsc->src_buf; + const uint8_t * mask_buf = dsc->mask_buf; + size_t vl; + + if(dest_px_size == 3) { + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + + /* Load mask */ + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + + /* Load RGB565 source */ + vuint16m2_t v_rgb565 = __riscv_vle16_v_u16m2(&src_buf[x], vl); + + /* Convert RGB565 to RGB888 and narrow to 8-bit */ + vuint16m2_t v_src_r16, v_src_g16, v_src_b16; + LV_RVV_RGB565_TO_RGB888_U16M2(v_rgb565, v_src_r16, v_src_g16, v_src_b16, vl); + vuint8m1_t v_src_r = __riscv_vnsrl_wx_u8m1(v_src_r16, 0, vl); + vuint8m1_t v_src_g = __riscv_vnsrl_wx_u8m1(v_src_g16, 0, vl); + vuint8m1_t v_src_b = __riscv_vnsrl_wx_u8m1(v_src_b16, 0, vl); + + /* Load destination */ + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + + /* Blend with mask using vwmaccu */ + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, vl); + 
LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + else { /* dest_px_size == 4 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + + /* Load mask */ + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + + /* Load RGB565 source */ + vuint16m2_t v_rgb565 = __riscv_vle16_v_u16m2(&src_buf[x], vl); + + /* Convert RGB565 to RGB888 and narrow to 8-bit */ + vuint16m2_t v_src_r16, v_src_g16, v_src_b16; + LV_RVV_RGB565_TO_RGB888_U16M2(v_rgb565, v_src_r16, v_src_g16, v_src_b16, vl); + vuint8m1_t v_src_r = __riscv_vnsrl_wx_u8m1(v_src_r16, 0, vl); + vuint8m1_t v_src_g = __riscv_vnsrl_wx_u8m1(v_src_g16, 0, vl); + vuint8m1_t v_src_b = __riscv_vnsrl_wx_u8m1(v_src_b16, 0, vl); + + /* Load destination */ + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + + /* Blend with mask using vwmaccu */ + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, v_r, v_g, v_b, vl); + + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, vl); + + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, vl); + + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + + return LV_RESULT_OK; +} + +/** + * RGB565 to RGB888/XRGB8888 with opacity and per-pixel mask + * effective mix = (mask * opa) >> 8 + * blend formula: result = (src * mix + dst * (255 - mix)) >> 8 + * + * Note: with_opa_mask needs 16-bit intermediate for mix calculation, + * so we cannot directly use the vwmaccu optimization for this case. 
+ */ +lv_result_t lv_draw_sw_blend_riscv_v_rgb565_to_rgb888_with_opa_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa < LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf != NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + const int32_t mask_stride = dsc->mask_stride; + const uint8_t opa = dsc->opa; + uint8_t * dest_buf = dsc->dest_buf; + const uint16_t * src_buf = dsc->src_buf; + const uint8_t * mask_buf = dsc->mask_buf; + size_t vl; + + if(dest_px_size == 3) { + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + + /* Load mask and compute effective mix = (mask * opa) >> 8 */ + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint16m2_t v_mask16 = __riscv_vzext_vf2_u16m2(v_mask, vl); + vuint16m2_t v_mix16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_mask16, opa, vl), 8, vl); + vuint8m1_t v_mix = __riscv_vnsrl_wx_u8m1(v_mix16, 0, vl); + + /* Load RGB565 source */ + vuint16m2_t v_rgb565 = __riscv_vle16_v_u16m2(&src_buf[x], vl); + + /* Convert RGB565 to RGB888 and narrow to 8-bit */ + vuint16m2_t v_src_r16, v_src_g16, v_src_b16; + LV_RVV_RGB565_TO_RGB888_U16M2(v_rgb565, v_src_r16, v_src_g16, v_src_b16, vl); + vuint8m1_t v_src_r = __riscv_vnsrl_wx_u8m1(v_src_r16, 0, vl); + vuint8m1_t v_src_g = __riscv_vnsrl_wx_u8m1(v_src_g16, 0, vl); + vuint8m1_t v_src_b = __riscv_vnsrl_wx_u8m1(v_src_b16, 0, vl); + + /* Load destination */ + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + + /* Blend with effective mix using vwmaccu */ + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix, v_r, v_g, v_b, vl); + + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix, vl); + + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + else { /* dest_px_size == 4 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + + /* Load mask and compute effective mix */ + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint16m2_t v_mask16 = __riscv_vzext_vf2_u16m2(v_mask, vl); + vuint16m2_t v_mix16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_mask16, opa, vl), 8, vl); + vuint8m1_t v_mix = __riscv_vnsrl_wx_u8m1(v_mix16, 0, vl); + + /* Load RGB565 source */ + vuint16m2_t v_rgb565 = __riscv_vle16_v_u16m2(&src_buf[x], vl); + + /* Convert RGB565 to RGB888 and narrow to 8-bit */ + vuint16m2_t v_src_r16, v_src_g16, v_src_b16; + LV_RVV_RGB565_TO_RGB888_U16M2(v_rgb565, v_src_r16, v_src_g16, v_src_b16, vl); + vuint8m1_t v_src_r = __riscv_vnsrl_wx_u8m1(v_src_r16, 0, vl); + vuint8m1_t v_src_g = __riscv_vnsrl_wx_u8m1(v_src_g16, 0, vl); + vuint8m1_t v_src_b = __riscv_vnsrl_wx_u8m1(v_src_b16, 0, vl); + + /* Load destination */ + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + + /* Blend with effective mix using vwmaccu */ + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix, v_r, v_g, v_b, vl); + 
LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix, vl); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, vl); + + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + + return LV_RESULT_OK; +} + +/********************** + * RGB888/XRGB8888 TO RGB888/XRGB8888 BLEND FUNCTIONS + **********************/ + +/** + * RGB888/XRGB8888 to RGB888/XRGB8888 simple copy (no blending, opa >= 255) + * src_px_size: 3 for RGB888, 4 for XRGB8888 + * dest_px_size: 3 for RGB888, 4 for XRGB8888 + */ +lv_result_t lv_draw_sw_blend_riscv_v_rgb888_to_rgb888(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size, uint32_t src_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(src_px_size == 3 || src_px_size == 4); + LV_ASSERT(dsc->opa >= LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf == NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + uint8_t * dest_buf = dsc->dest_buf; + const uint8_t * src_buf = dsc->src_buf; + size_t vl; + + /* Fast path: same pixel size, use RVV memcpy */ + if(src_px_size == dest_px_size) { + const int32_t row_bytes = w * dest_px_size; + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < row_bytes; x += vl) { + vl = __riscv_vsetvl_e8m8(row_bytes - x); + vuint8m8_t v_data = __riscv_vle8_v_u8m8(src_buf + x, vl); + __riscv_vse8_v_u8m8(dest_buf + x, v_data, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + return LV_RESULT_OK; + } + + /* Different pixel sizes: need per-pixel conversion */ + if(dest_px_size == 3) { + /* Source: XRGB8888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_XRGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_src_b, v_src_g, v_src_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + else { + /* Destination: XRGB8888 */ + size_t max_vl = __riscv_vsetvlmax_e8m1(); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, max_vl); + + /* Source: RGB888 -> XRGB8888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_RGB888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_src_b, v_src_g, v_src_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + + return LV_RESULT_OK; +} + +/** + * RGB888/XRGB8888 to RGB888/XRGB8888 with opacity + * blend formula: result = (src * opa + dst * (255 - opa)) >> 8 + */ +lv_result_t lv_draw_sw_blend_riscv_v_rgb888_to_rgb888_with_opa(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size, uint32_t src_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(src_px_size == 3 || src_px_size == 4); + LV_ASSERT(dsc->opa < LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf == NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + 
const uint8_t opa = dsc->opa; + uint8_t * dest_buf = dsc->dest_buf; + const uint8_t * src_buf = dsc->src_buf; + size_t vl; + + if(dest_px_size == 3) { + if(src_px_size == 3) { + /* RGB888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_RGB888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, opa, v_r, v_g, v_b, vl); + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + else { + /* XRGB8888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_XRGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, opa, v_r, v_g, v_b, vl); + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + } + else { + size_t max_vl = __riscv_vsetvlmax_e8m1(); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, max_vl); + if(src_px_size == 3) { + /* RGB888 -> XRGB8888 */ + for(int32_t y = 0; y < h; y++) { + uint8_t * dest_row = dest_buf; + const uint8_t * src_row = src_buf; + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_RGB888_U8M1(src_row, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_row, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, opa, v_r, v_g, v_b, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_row, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + else { + /* XRGB8888 -> XRGB8888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_XRGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, opa, v_r, v_g, v_b, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + } + + return LV_RESULT_OK; +} + +/** + * RGB888/XRGB8888 to RGB888/XRGB8888 with per-pixel mask + * blend formula: result = (src * mask + dst * (255 - mask)) >> 8 + */ +lv_result_t lv_draw_sw_blend_riscv_v_rgb888_to_rgb888_with_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size, uint32_t src_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(src_px_size == 3 || src_px_size == 4); + LV_ASSERT(dsc->opa >= LV_OPA_MAX); + 
LV_ASSERT(dsc->mask_buf != NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + const int32_t mask_stride = dsc->mask_stride; + uint8_t * dest_buf = dsc->dest_buf; + const uint8_t * src_buf = dsc->src_buf; + const uint8_t * mask_buf = dsc->mask_buf; + size_t vl; + + if(dest_px_size == 3) { + if(src_px_size == 3) { + /* RGB888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_RGB888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, + v_r, v_g, v_b, + vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, vl); + + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + else { + /* XRGB8888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_XRGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, + v_r, v_g, v_b, + vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, vl); + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + } + else { + size_t max_vl = __riscv_vsetvlmax_e8m1(); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, max_vl); + if(src_px_size == 3) { + /* RGB888 -> XRGB8888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_RGB888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, + v_r, v_g, v_b, + vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + else { + /* XRGB8888 -> XRGB8888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_XRGB8888_U8M1(src_buf, x, 
v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mask, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + } + + return LV_RESULT_OK; +} + +/** + * RGB888/XRGB8888 to RGB888/XRGB8888 with opacity and per-pixel mask + * effective mix = (mask * opa) >> 8 + * blend formula: result = (src * mix + dst * (255 - mix)) >> 8 + */ +lv_result_t lv_draw_sw_blend_riscv_v_rgb888_to_rgb888_with_opa_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size, uint32_t src_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(src_px_size == 3 || src_px_size == 4); + LV_ASSERT(dsc->opa < LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf != NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + const int32_t mask_stride = dsc->mask_stride; + const uint8_t opa = dsc->opa; + uint8_t * dest_buf = dsc->dest_buf; + const uint8_t * src_buf = dsc->src_buf; + const uint8_t * mask_buf = dsc->mask_buf; + size_t vl; + + if(dest_px_size == 3) { + if(src_px_size == 3) { + /* RGB888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint16m2_t v_mask16 = __riscv_vzext_vf2_u16m2(v_mask, vl); + vuint16m2_t v_mix16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_mask16, opa, vl), 8, vl); + vuint8m1_t v_mix = __riscv_vnsrl_wx_u8m1(v_mix16, 0, vl); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_RGB888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix, vl); + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + else { + /* XRGB8888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint16m2_t v_mask16 = __riscv_vzext_vf2_u16m2(v_mask, vl); + vuint16m2_t v_mix16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_mask16, opa, vl), 8, vl); + vuint8m1_t v_mix = __riscv_vnsrl_wx_u8m1(v_mix16, 0, vl); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_XRGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, v_mix, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, 
v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix, vl); + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + } + else { + size_t max_vl = __riscv_vsetvlmax_e8m1(); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, max_vl); + if(src_px_size == 3) { + /* RGB888 -> XRGB8888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint16m2_t v_mask16 = __riscv_vzext_vf2_u16m2(v_mask, vl); + vuint16m2_t v_mix16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_mask16, opa, vl), 8, vl); + vuint8m1_t v_mix = __riscv_vnsrl_wx_u8m1(v_mix16, 0, vl); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_RGB888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, v_mix, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + else { + /* XRGB8888 -> XRGB8888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint16m2_t v_mask16 = __riscv_vzext_vf2_u16m2(v_mask, vl); + vuint16m2_t v_mix16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vx_u16m2(v_mask16, opa, vl), 8, vl); + vuint8m1_t v_mix = __riscv_vnsrl_wx_u8m1(v_mix16, 0, vl); + vuint8m1_t v_src_b, v_src_g, v_src_r; + LV_RVV_LOAD_XRGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, v_mix, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_mix, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + } + + return LV_RESULT_OK; +} + +/********************** + * ARGB8888 TO RGB888/XRGB8888 BLEND FUNCTIONS + **********************/ + +/** + * ARGB8888 to RGB888/XRGB8888 blend using source alpha + * blend formula: result = (src * src_alpha + dst * (255 - src_alpha)) >> 8 + */ +lv_result_t lv_draw_sw_blend_riscv_v_argb8888_to_rgb888(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa >= LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf == NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + uint8_t * dest_buf = dsc->dest_buf; + const uint8_t * src_buf = dsc->src_buf; + size_t vl; + + if(dest_px_size == 3) { + /* ARGB8888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = 
__riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r, v_src_a; + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_ARGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, v_src_a, vl); + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, v_src_a, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_src_a, vl); + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + else { + /* ARGB8888 -> XRGB8888 */ + size_t max_vl = __riscv_vsetvlmax_e8m1(); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, max_vl); + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r, v_src_a; + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_ARGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, v_src_a, vl); + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, v_src_a, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_src_a, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + + return LV_RESULT_OK; +} + +/** + * ARGB8888 to RGB888/XRGB8888 with global opacity + * effective_alpha = (src_alpha * opa) >> 8 + * blend formula: result = (src * effective_alpha + dst * (255 - effective_alpha)) >> 8 + */ +lv_result_t lv_draw_sw_blend_riscv_v_argb8888_to_rgb888_with_opa(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa < LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf == NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + const uint8_t opa = dsc->opa; + uint8_t * dest_buf = dsc->dest_buf; + const uint8_t * src_buf = dsc->src_buf; + size_t vl; + + if(dest_px_size == 3) { + /* ARGB8888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r, v_src_a; + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_ARGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, v_src_a, vl); + vuint8m1_t v_eff_a; + LV_RVV_CALC_EFF_ALPHA_OPA_U8M1(v_src_a, opa, v_eff_a, vl); + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, v_eff_a, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_eff_a, vl); + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + else { + /* ARGB8888 -> XRGB8888 */ + size_t max_vl = __riscv_vsetvlmax_e8m1(); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, max_vl); + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { 
+ vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r, v_src_a; + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_ARGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, v_src_a, vl); + vuint8m1_t v_eff_a; + LV_RVV_CALC_EFF_ALPHA_OPA_U8M1(v_src_a, opa, v_eff_a, vl); + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, v_eff_a, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_eff_a, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + + return LV_RESULT_OK; +} + +/** + * ARGB8888 to RGB888/XRGB8888 with per-pixel mask + * effective_alpha = (src_alpha * mask) >> 8 + * blend formula: result = (src * effective_alpha + dst * (255 - effective_alpha)) >> 8 + */ +lv_result_t lv_draw_sw_blend_riscv_v_argb8888_to_rgb888_with_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa >= LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf != NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + const int32_t mask_stride = dsc->mask_stride; + uint8_t * dest_buf = dsc->dest_buf; + const uint8_t * src_buf = dsc->src_buf; + const uint8_t * mask_buf = dsc->mask_buf; + size_t vl; + + if(dest_px_size == 3) { + /* ARGB8888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint8m1_t v_src_b, v_src_g, v_src_r, v_src_a; + LV_RVV_LOAD_ARGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, v_src_a, vl); + vuint8m1_t v_eff_a; + LV_RVV_CALC_EFF_ALPHA_MASK_U8M1(v_src_a, v_mask, v_eff_a, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_eff_a, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_eff_a, vl); + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + else { + /* ARGB8888 -> XRGB8888 */ + size_t max_vl = __riscv_vsetvlmax_e8m1(); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, max_vl); + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint8m1_t v_src_b, v_src_g, v_src_r, v_src_a; + LV_RVV_LOAD_ARGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, v_src_a, vl); + vuint8m1_t v_eff_a; + LV_RVV_CALC_EFF_ALPHA_MASK_U8M1(v_src_a, v_mask, v_eff_a, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, v_eff_a, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + 
v_eff_a, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + + return LV_RESULT_OK; +} + +/** + * ARGB8888 to RGB888/XRGB8888 with opacity and per-pixel mask + * effective_alpha = (src_alpha * mask * opa) >> 16 + * blend formula: result = (src * effective_alpha + dst * (255 - effective_alpha)) >> 8 + */ +lv_result_t lv_draw_sw_blend_riscv_v_argb8888_to_rgb888_with_opa_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + LV_ASSERT(dsc->opa < LV_OPA_MAX); + LV_ASSERT(dsc->mask_buf != NULL); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + const int32_t mask_stride = dsc->mask_stride; + const uint8_t opa = dsc->opa; + uint8_t * dest_buf = dsc->dest_buf; + const uint8_t * src_buf = dsc->src_buf; + const uint8_t * mask_buf = dsc->mask_buf; + size_t vl; + + if(dest_px_size == 3) { + /* ARGB8888 -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint8m1_t v_src_b, v_src_g, v_src_r, v_src_a; + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_ARGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, v_src_a, vl); + vuint8m1_t v_eff_a; + LV_RVV_CALC_EFF_ALPHA_MASK_OPA_U8M1(v_src_a, v_mask, opa, v_eff_a, vl); + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_eff_a, + v_r, v_g, v_b, + vl); + + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_eff_a, vl); + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + else { + /* ARGB8888 -> XRGB8888 */ + size_t max_vl = __riscv_vsetvlmax_e8m1(); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, max_vl); + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_mask = __riscv_vle8_v_u8m1(&mask_buf[x], vl); + vuint8m1_t v_src_b, v_src_g, v_src_r, v_src_a; + LV_RVV_LOAD_ARGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, v_src_a, vl); + vuint8m1_t v_eff_a; + LV_RVV_CALC_EFF_ALPHA_MASK_OPA_U8M1(v_src_a, v_mask, opa, v_eff_a, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_r, v_g, v_b; + LV_RVV_BLEND_RGB_VMASK_U8M1(v_src_r, v_src_g, v_src_b, v_dst_r, v_dst_g, v_dst_b, v_eff_a, v_r, v_g, v_b, vl); + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_eff_a, vl); + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + mask_buf += mask_stride; + } + } + + return LV_RESULT_OK; +} + +/** + * ARGB8888 premultiplied to RGB888/XRGB8888 + * For premultiplied alpha, source RGB is already multiplied by alpha: + * src_premul = src * src_alpha / 255 + * blend formula: result = src_premul + dst * (255 - src_alpha) / 255 + * = src_premul + 
(dst * (255 - src_alpha)) >> 8 + */ +lv_result_t lv_draw_sw_blend_riscv_v_argb8888_premultiplied_to_rgb888(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size) +{ + LV_ASSERT(dest_px_size == 3 || dest_px_size == 4); + + const int32_t w = dsc->dest_w; + const int32_t h = dsc->dest_h; + const int32_t dest_stride = dsc->dest_stride; + const int32_t src_stride = dsc->src_stride; + uint8_t * dest_buf = dsc->dest_buf; + const uint8_t * src_buf = dsc->src_buf; + size_t vl; + + if(dest_px_size == 3) { + /* ARGB8888 premultiplied -> RGB888 */ + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r, v_src_a; + LV_RVV_LOAD_ARGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, v_src_a, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_RGB888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_inv_a = __riscv_vrsub_vx_u8m1(v_src_a, 255, vl); + vuint16m2_t v_dst_r16 = __riscv_vzext_vf2_u16m2(v_dst_r, vl); + vuint16m2_t v_dst_g16 = __riscv_vzext_vf2_u16m2(v_dst_g, vl); + vuint16m2_t v_dst_b16 = __riscv_vzext_vf2_u16m2(v_dst_b, vl); + vuint16m2_t v_inv_a16 = __riscv_vzext_vf2_u16m2(v_inv_a, vl); + vuint16m2_t v_r16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vv_u16m2(v_dst_r16, v_inv_a16, vl), 8, vl); + vuint16m2_t v_g16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vv_u16m2(v_dst_g16, v_inv_a16, vl), 8, vl); + vuint16m2_t v_b16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vv_u16m2(v_dst_b16, v_inv_a16, vl), 8, vl); + vuint8m1_t v_r = __riscv_vadd_vv_u8m1(v_src_r, __riscv_vnsrl_wx_u8m1(v_r16, 0, vl), vl); + vuint8m1_t v_g = __riscv_vadd_vv_u8m1(v_src_g, __riscv_vnsrl_wx_u8m1(v_g16, 0, vl), vl); + vuint8m1_t v_b = __riscv_vadd_vv_u8m1(v_src_b, __riscv_vnsrl_wx_u8m1(v_b16, 0, vl), vl); + + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_src_a, vl); + + LV_RVV_STORE_RGB888_U8M1(dest_buf, x, v_b, v_g, v_r, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + else { + /* ARGB8888 premultiplied -> XRGB8888 */ + size_t max_vl = __riscv_vsetvlmax_e8m1(); + vuint8m1_t v_a = __riscv_vmv_v_x_u8m1(0xFF, max_vl); + for(int32_t y = 0; y < h; y++) { + for(int32_t x = 0; x < w; x += vl) { + vl = __riscv_vsetvl_e8m1(w - x); + vuint8m1_t v_src_b, v_src_g, v_src_r, v_src_a; + LV_RVV_LOAD_ARGB8888_U8M1(src_buf, x, v_src_b, v_src_g, v_src_r, v_src_a, vl); + vuint8m1_t v_dst_b, v_dst_g, v_dst_r; + LV_RVV_LOAD_XRGB8888_U8M1(dest_buf, x, v_dst_b, v_dst_g, v_dst_r, vl); + vuint8m1_t v_inv_a = __riscv_vrsub_vx_u8m1(v_src_a, 255, vl); + vuint16m2_t v_dst_r16 = __riscv_vzext_vf2_u16m2(v_dst_r, vl); + vuint16m2_t v_dst_g16 = __riscv_vzext_vf2_u16m2(v_dst_g, vl); + vuint16m2_t v_dst_b16 = __riscv_vzext_vf2_u16m2(v_dst_b, vl); + vuint16m2_t v_inv_a16 = __riscv_vzext_vf2_u16m2(v_inv_a, vl); + vuint16m2_t v_r16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vv_u16m2(v_dst_r16, v_inv_a16, vl), 8, vl); + vuint16m2_t v_g16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vv_u16m2(v_dst_g16, v_inv_a16, vl), 8, vl); + vuint16m2_t v_b16 = __riscv_vsrl_vx_u16m2(__riscv_vmul_vv_u16m2(v_dst_b16, v_inv_a16, vl), 8, vl); + vuint8m1_t v_r = __riscv_vadd_vv_u8m1(v_src_r, __riscv_vnsrl_wx_u8m1(v_r16, 0, vl), vl); + vuint8m1_t v_g = __riscv_vadd_vv_u8m1(v_src_g, __riscv_vnsrl_wx_u8m1(v_g16, 0, vl), vl); + vuint8m1_t v_b = __riscv_vadd_vv_u8m1(v_src_b, __riscv_vnsrl_wx_u8m1(v_b16, 0, vl), vl); + + LV_RVV_BLEND_OPTIMIZE_MASK_U8M1(v_r, v_g, v_b, + 
v_src_r, v_src_g, v_src_b, + v_dst_r, v_dst_g, v_dst_b, + v_src_a, vl); + + LV_RVV_STORE_XRGB8888_U8M1(dest_buf, x, v_b, v_g, v_r, v_a, vl); + } + dest_buf = drawbuf_next_row(dest_buf, dest_stride); + src_buf = drawbuf_next_row(src_buf, src_stride); + } + } + + return LV_RESULT_OK; +} + +#endif /* LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_RISCV_V */ \ No newline at end of file diff --git a/src/draw/sw/blend/riscv_v/lv_draw_sw_blend_riscv_v_to_rgb888.h b/src/draw/sw/blend/riscv_v/lv_draw_sw_blend_riscv_v_to_rgb888.h new file mode 100644 index 0000000000..ae887bf140 --- /dev/null +++ b/src/draw/sw/blend/riscv_v/lv_draw_sw_blend_riscv_v_to_rgb888.h @@ -0,0 +1,170 @@ +/** + * @file lv_draw_sw_blend_riscv_v_to_rgb888.h + */ + +#ifndef LV_DRAW_SW_BLEND_RISCV_V_TO_RGB888_H +#define LV_DRAW_SW_BLEND_RISCV_V_TO_RGB888_H + +#ifdef __cplusplus +extern "C" { +#endif + +/********************* + * INCLUDES + *********************/ + +#include "../../../../lv_conf_internal.h" +#if LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_RISCV_V + +#include "../../../../misc/lv_types.h" +/********************* + * DEFINES + *********************/ + +/* Color fill to RGB888/XRGB8888 */ +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888 +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_color_to_rgb888(dsc, dest_px_size) +#endif + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_OPA(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_color_to_rgb888_with_opa(dsc, dest_px_size) +#endif + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_WITH_MASK(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_color_to_rgb888_with_mask(dsc, dest_px_size) +#endif + +#ifndef LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_COLOR_BLEND_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_color_to_rgb888_with_opa_mask(dsc, dest_px_size) +#endif + + +/* RGB565 image blend to RGB888/XRGB8888 */ +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_rgb565_to_rgb888(dsc, dest_px_size) +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_rgb565_to_rgb888_with_opa(dsc, dest_px_size) +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_rgb565_to_rgb888_with_mask(dsc, dest_px_size) +#endif + +#ifndef LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_RGB565_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_rgb565_to_rgb888_with_opa_mask(dsc, dest_px_size) +#endif + +/* RGB888/XRGB8888 image blend to RGB888/XRGB8888 */ +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size, src_px_size) \ + lv_draw_sw_blend_riscv_v_rgb888_to_rgb888(dsc, dest_px_size, src_px_size) +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size, src_px_size) \ + lv_draw_sw_blend_riscv_v_rgb888_to_rgb888_with_opa(dsc, dest_px_size, src_px_size) +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size, 
src_px_size) \ + lv_draw_sw_blend_riscv_v_rgb888_to_rgb888_with_mask(dsc, dest_px_size, src_px_size) +#endif + +#ifndef LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_RGB888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size, src_px_size) \ + lv_draw_sw_blend_riscv_v_rgb888_to_rgb888_with_opa_mask(dsc, dest_px_size, src_px_size) +#endif + +/* ARGB8888 image blend to RGB888/XRGB8888 */ +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_argb8888_to_rgb888(dsc, dest_px_size) +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_OPA(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_argb8888_to_rgb888_with_opa(dsc, dest_px_size) +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_WITH_MASK(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_argb8888_to_rgb888_with_mask(dsc, dest_px_size) +#endif + +#ifndef LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA +#define LV_DRAW_SW_ARGB8888_BLEND_NORMAL_TO_RGB888_MIX_MASK_OPA(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_argb8888_to_rgb888_with_opa_mask(dsc, dest_px_size) +#endif + +#ifndef LV_DRAW_SW_ARGB8888_PREMULTIPLIED_BLEND_NORMAL_TO_RGB888 +#define LV_DRAW_SW_ARGB8888_PREMULTIPLIED_BLEND_NORMAL_TO_RGB888(dsc, dest_px_size) \ + lv_draw_sw_blend_riscv_v_argb8888_premultiplied_to_rgb888(dsc, dest_px_size) +#endif + +/********************** + * TYPEDEFS + **********************/ + +/********************** + * GLOBAL PROTOTYPES + **********************/ + +/* Color fill functions */ +lv_result_t lv_draw_sw_blend_riscv_v_color_to_rgb888(lv_draw_sw_blend_fill_dsc_t * dsc, uint32_t dest_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_color_to_rgb888_with_opa(lv_draw_sw_blend_fill_dsc_t * dsc, uint32_t dest_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_color_to_rgb888_with_mask(lv_draw_sw_blend_fill_dsc_t * dsc, + uint32_t dest_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_color_to_rgb888_with_opa_mask(lv_draw_sw_blend_fill_dsc_t * dsc, + uint32_t dest_px_size); + +/* RGB565 to RGB888/XRGB8888 blend functions */ +lv_result_t lv_draw_sw_blend_riscv_v_rgb565_to_rgb888(lv_draw_sw_blend_image_dsc_t * dsc, uint32_t dest_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_rgb565_to_rgb888_with_opa(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_rgb565_to_rgb888_with_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_rgb565_to_rgb888_with_opa_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size); + +/* RGB888/XRGB8888 to RGB888/XRGB8888 blend functions */ +lv_result_t lv_draw_sw_blend_riscv_v_rgb888_to_rgb888(lv_draw_sw_blend_image_dsc_t * dsc, uint32_t dest_px_size, + uint32_t src_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_rgb888_to_rgb888_with_opa(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size, uint32_t src_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_rgb888_to_rgb888_with_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size, uint32_t src_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_rgb888_to_rgb888_with_opa_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size, uint32_t src_px_size); + +/* ARGB8888 to RGB888/XRGB8888 blend functions */ +lv_result_t lv_draw_sw_blend_riscv_v_argb8888_to_rgb888(lv_draw_sw_blend_image_dsc_t * 
dsc, uint32_t dest_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_argb8888_to_rgb888_with_opa(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_argb8888_to_rgb888_with_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_argb8888_to_rgb888_with_opa_mask(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size); +lv_result_t lv_draw_sw_blend_riscv_v_argb8888_premultiplied_to_rgb888(lv_draw_sw_blend_image_dsc_t * dsc, + uint32_t dest_px_size); + +/********************** + * MACROS + **********************/ + +#endif /* LV_USE_DRAW_SW_ASM == LV_DRAW_SW_ASM_RISCV_V */ + +#ifdef __cplusplus +} +#endif + +#endif /* LV_DRAW_SW_BLEND_RISCV_V_TO_RGB888_H */ diff --git a/src/lv_conf_internal.h b/src/lv_conf_internal.h index 04fa55f786..414c3eec4f 100644 --- a/src/lv_conf_internal.h +++ b/src/lv_conf_internal.h @@ -28,6 +28,7 @@ #define LV_DRAW_SW_ASM_NONE 0 #define LV_DRAW_SW_ASM_NEON 1 #define LV_DRAW_SW_ASM_HELIUM 2 +#define LV_DRAW_SW_ASM_RISCV_V 3 #define LV_DRAW_SW_ASM_CUSTOM 255 #define LV_NEMA_HAL_CUSTOM 0 diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index 11a27d3a5a..9c609dbdbe 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -69,6 +69,12 @@ set(LVGL_TEST_OPTIONS_VG_LITE -Wno-dangling-pointer # workaround for thorvg dangling-pointer warning ) +set(LVGL_TEST_OPTIONS_RISCV_V + -DLV_TEST_OPTION=5 + -DLVGL_CI_USING_SYS_HEAP + -DLV_USE_DRAW_SW_ASM=LV_DRAW_SW_ASM_RISCV_V +) + set(LVGL_TEST_OPTIONS_SDL -DLV_TEST_OPTION=7 ) @@ -149,6 +155,13 @@ elseif (OPTIONS_TEST_VG_LITE) # Set a tolerance value for the VG-Lite tests. add_definitions(-DREF_IMG_TOLERANCE=9) endif() +elseif (OPTIONS_TEST_RISCV_V) + set (BUILD_OPTIONS ${LVGL_TEST_OPTIONS_RISCV_V} ${SANITIZE_AND_COVERAGE_OPTIONS}) + filter_compiler_options (C TEST_LIBS ${SANITIZE_AND_COVERAGE_OPTIONS}) + set (CONFIG_LV_BUILD_EXAMPLES OFF CACHE BOOL "disable examples" FORCE) + set (ENABLE_TESTS ON) + add_definitions(-DREF_IMGS_PATH="ref_imgs/") + message(STATUS "RISC-V Vector (RVV) software emulation test enabled") else() message(FATAL_ERROR "Must provide a known options value (check main.py?).") endif() diff --git a/tests/main.py b/tests/main.py index 115f7f65e1..4ddc4b3b0b 100755 --- a/tests/main.py +++ b/tests/main.py @@ -34,6 +34,7 @@ test_options = { 'OPTIONS_TEST_SYSHEAP': 'Test config, system heap, 32 bit color depth', 'OPTIONS_TEST_DEFHEAP': 'Test config, LVGL heap, 32 bit color depth', 'OPTIONS_TEST_VG_LITE': 'VG-Lite simulator with full config, 32 bit color depth', + 'OPTIONS_TEST_RISCV_V': 'RISC-V Vector emulation with full config, 32 bit color depth', }
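---

For reference (not part of the patch): the doc comments above all reduce to the same few fixed-point formulas, so a minimal scalar sketch of that per-pixel arithmetic can be useful for sanity-checking the vector paths against the plain C renderer. The helper names below are illustrative only; the constants and shift amounts are the ones the RVV macros use.

/* Scalar reference for the per-pixel arithmetic the RVV blend paths vectorize.
 * Illustrative sketch only; function names are hypothetical, not LVGL API. */
#include <stdint.h>

/* RGB565 -> 8-bit channels, same fixed-point constants as the vector path */
static inline void rgb565_to_rgb888_scalar(uint16_t c, uint8_t * r, uint8_t * g, uint8_t * b)
{
    uint16_t r5 = (c >> 11) & 0x1F;
    uint16_t g6 = (c >> 5)  & 0x3F;
    uint16_t b5 = c & 0x1F;
    *r = (uint8_t)((r5 * 2106u) >> 8);   /* 2106 ~ (255/31) * 256 */
    *g = (uint8_t)((g6 * 1037u) >> 8);   /* 1037 ~ (255/63) * 256 */
    *b = (uint8_t)((b5 * 2106u) >> 8);
}

/* result = (src * mix + dst * (255 - mix)) >> 8, the formula every blend path shares */
static inline uint8_t blend_u8_scalar(uint8_t src, uint8_t dst, uint8_t mix)
{
    return (uint8_t)(((uint16_t)src * mix + (uint16_t)dst * (255u - mix)) >> 8);
}

/* Effective mix factors, matching the widened 16-bit intermediates in the vector code */
static inline uint8_t mix_mask_opa(uint8_t mask, uint8_t opa)
{
    return (uint8_t)(((uint16_t)mask * opa) >> 8);        /* opacity + per-pixel mask  */
}

static inline uint8_t mix_alpha_mask_opa(uint8_t a, uint8_t mask, uint8_t opa)
{
    return (uint8_t)(((uint32_t)a * mask * opa) >> 16);   /* ARGB alpha + mask + opa   */
}

Note that the `>> 8` form never reaches 255 even when mix == 255 (the exact divisor would be 255, not 256), which is presumably why the vector code follows each blend with the separate LV_RVV_BLEND_OPTIMIZE_MASK_U8M1 step to pass src or dst through unchanged at the extremes.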