# Project root (used to locate the C check sources) and the `features` module
# that creates every feature object in this file.
source_root = meson.project_source_root()
mod_features = import('features')

# SSE family
# ----------
# Each feature takes a name, an integer rank (second positional argument; it
# grows along the implication chain), the compiler flag that enables it, and
# a C source file passed as `test_code`.
SSE = mod_features.new(
  'SSE', 1,
  args: '-msse',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse.c')[0]
)
SSE2 = mod_features.new(
  'SSE2', 2,
  implies: SSE,
  args: '-msse2',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse2.c')[0]
)
# Enabling SSE without SSE2 is useless, and SSE2 is non-optional on x86_64,
# so SSE is made to imply SSE2 as well (this has to happen after SSE2 exists).
SSE.update(implies: SSE2)
SSE3 = mod_features.new(
  'SSE3', 3,
  implies: SSE2,
  args: '-msse3',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse3.c')[0]
)
SSSE3 = mod_features.new(
  'SSSE3', 4,
  implies: SSE3,
  args: '-mssse3',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_ssse3.c')[0]
)
SSE41 = mod_features.new(
  'SSE41', 5,
  implies: SSSE3,
  args: '-msse4.1',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse41.c')[0]
)
POPCNT = mod_features.new(
  'POPCNT', 6,
  implies: SSE41,
  args: '-mpopcnt',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_popcnt.c')[0]
)
SSE42 = mod_features.new(
  'SSE42', 7,
  implies: POPCNT,
  args: '-msse4.2',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_sse42.c')[0]
)
# AVX / AVX2 family
# -----------------
# 8-19 left as margin for any extra features
AVX = mod_features.new(
  'AVX', 20,
  implies: SSE42,
  args: '-mavx',
  detect: {'val': 'AVX', 'match': '.*SSE.*'},
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx.c')[0]
)
XOP = mod_features.new(
  'XOP', 21,
  implies: AVX,
  args: '-mxop',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_xop.c')[0]
)
FMA4 = mod_features.new(
  'FMA4', 22,
  implies: AVX,
  args: '-mfma4',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_fma4.c')[0]
)
# x86 half-precision
F16C = mod_features.new(
  'F16C', 23,
  implies: AVX,
  args: '-mf16c',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_f16c.c')[0]
)
FMA3 = mod_features.new(
  'FMA3', 24,
  implies: F16C,
  args: '-mfma',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_fma3.c')[0]
)
# keep this flag set matched to HWY_AVX2
AVX2 = mod_features.new(
  'AVX2', 25,
  implies: FMA3,
  args: [
    '-mavx2',
    '-maes',
    '-mpclmul',
    '-mbmi',
    '-mbmi2'
  ],
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx2.c')[0]
)
# AVX512 family
# -------------
# 26-39 left as margin for any extra features
AVX512F = mod_features.new(
  'AVX512F', 40,
  implies: [AVX2],
  # Disables mmx because of stack corruption that may happen during mask
  # conversions.
  # TODO (seiko2plus): provide more clarification
  args: [
    '-mno-mmx',
    '-mavx512f'
  ],
  detect: {'val': 'AVX512F', 'match': '.*'},
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512f.c')[0],
  extra_tests: {
    'AVX512F_REDUCE': files(source_root + '/numpy/distutils/checks/extra_avx512f_reduce.c')[0]
  }
)
AVX512CD = mod_features.new(
  'AVX512CD', 41,
  implies: AVX512F,
  args: '-mavx512cd',
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512cd.c')[0]
)
# The features below bundle several ISA extensions under one name; the
# bundled extension names are listed in `group`.
AVX512_KNL = mod_features.new(
  'AVX512_KNL', 42,
  implies: AVX512CD,
  args: [
    '-mavx512er',
    '-mavx512pf'
  ],
  group: [
    'AVX512ER',
    'AVX512PF'
  ],
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_knl.c')[0]
)
AVX512_KNM = mod_features.new(
  'AVX512_KNM', 43,
  implies: AVX512_KNL,
  args: [
    '-mavx5124fmaps',
    '-mavx5124vnniw',
    '-mavx512vpopcntdq'
  ],
  group: [
    'AVX5124FMAPS',
    'AVX5124VNNIW',
    'AVX512VPOPCNTDQ'
  ],
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_knm.c')[0]
)
AVX512_SKX = mod_features.new(
  'AVX512_SKX', 50,
  implies: AVX512CD,
  args: [
    '-mavx512vl',
    '-mavx512bw',
    '-mavx512dq'
  ],
  group: [
    'AVX512VL',
    'AVX512BW',
    'AVX512DQ'
  ],
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_skx.c')[0],
  extra_tests: {
    'AVX512BW_MASK': files(source_root + '/numpy/distutils/checks/extra_avx512bw_mask.c')[0],
    'AVX512DQ_MASK': files(source_root + '/numpy/distutils/checks/extra_avx512dq_mask.c')[0]
  }
)
AVX512_CLX = mod_features.new(
  'AVX512_CLX', 51,
  implies: AVX512_SKX,
  args: '-mavx512vnni',
  group: ['AVX512VNNI'],
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_clx.c')[0]
)
AVX512_CNL = mod_features.new(
  'AVX512_CNL', 52,
  implies: AVX512_SKX,
  args: [
    '-mavx512ifma',
    '-mavx512vbmi'
  ],
  group: [
    'AVX512IFMA',
    'AVX512VBMI'
  ],
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_cnl.c')[0]
)
AVX512_ICL = mod_features.new(
  'AVX512_ICL', 53,
  implies: [
    AVX512_CLX,
    AVX512_CNL
  ],
  args: [
    '-mavx512vbmi2',
    '-mavx512bitalg',
    '-mavx512vpopcntdq'
  ],
  group: [
    'AVX512VBMI2',
    'AVX512BITALG',
    'AVX512VPOPCNTDQ'
  ],
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_icl.c')[0]
)
# TODO add support for zen4
AVX512_SPR = mod_features.new(
  'AVX512_SPR', 55,
  implies: AVX512_ICL,
  args: ['-mavx512fp16'],
  group: ['AVX512FP16'],
  test_code: files(source_root + '/numpy/distutils/checks/cpu_avx512_spr.c')[0]
)

# Specializations for non unix-like compilers
# -------------------------------------------
# The host CPU family and the C compiler id select which of the
# compiler-specific overrides in this file are applied.
cpu_family = host_machine.cpu_family()
compiler_id = meson.get_compiler('c').get_id()
# AVX512_SPR stays enabled only for gcc and clang; every other compiler gets
# it disabled with a message naming that compiler.
if compiler_id != 'gcc' and compiler_id != 'clang'
  AVX512_SPR.update(disable: compiler_id + ' compiler does not support it')
endif

# Common specializations between both Intel compilers (unix-like and msvc-like)
if compiler_id == 'intel' or compiler_id == 'intel-cl'
  # POPCNT and F16C don't own private flags, however the compiler still
  # provides ISA capability for them, so their flags are blanked.
  foreach fet : [POPCNT, F16C]
    fet.update(args: '')
  endforeach
  # Intel compilers don't support the following features independently,
  # so they are tied together through mutual implications.
  FMA3.update(implies: [F16C, AVX2])
  AVX2.update(implies: [F16C, FMA3])
  AVX512F.update(implies: [AVX2, AVX512CD])
  AVX512CD.update(implies: [AVX512F])
  # Features that are switched off entirely for Intel compilers.
  foreach fet : [XOP, FMA4]
    fet.update(disable: 'Intel Compiler does not support it')
  endforeach
endif

# Flag overrides for the msvc-like Intel compiler (intel-cl), which takes
# /arch:<isa> and /Qx:<target> switches instead of the -m<isa> flags.
if compiler_id == 'intel-cl'
  # Pattern for `match`: any /arch:<isa> flag (same role as the `clear_*`
  # patterns used in the other compiler sections of this file).
  clear_arch = '/arch:.*'
  # Pattern for `match`: any /arch:<isa> or /Qx:<target> flag. The alternation
  # has to be a group; the previous '/[arch|Qx]:.*' was a character class that
  # accepts exactly one character between '/' and ':', so it matched neither
  # '/arch:...' nor '/Qx:...'.
  clear_arch_qx = '/(arch|Qx):.*'

  # These features map directly onto /arch:<feature name>.
  foreach fet : [SSE, SSE2, SSE3, SSSE3, AVX]
    fet.update(args: {'val': '/arch:' + fet.get('name'), 'match': clear_arch})
  endforeach
  SSE41.update(args: {'val': '/arch:SSE4.1', 'match': clear_arch})
  SSE42.update(args: {'val': '/arch:SSE4.2', 'match': clear_arch})
  FMA3.update(args: {'val': '/arch:CORE-AVX2', 'match': clear_arch})
  AVX2.update(args: {'val': '/arch:CORE-AVX2', 'match': clear_arch})
  AVX512F.update(args: {'val': '/Qx:COMMON-AVX512', 'match': clear_arch})
  AVX512CD.update(args: {'val': '/Qx:COMMON-AVX512', 'match': clear_arch})
  # From here on an earlier feature may already have contributed a /Qx: flag,
  # so both flag spellings are matched.
  AVX512_KNL.update(args: {'val': '/Qx:KNL', 'match': clear_arch_qx})
  AVX512_KNM.update(args: {'val': '/Qx:KNM', 'match': clear_arch_qx})
  AVX512_SKX.update(args: {'val': '/Qx:SKYLAKE-AVX512', 'match': clear_arch_qx})
  AVX512_CLX.update(args: {'val': '/Qx:CASCADELAKE', 'match': clear_arch_qx})
  AVX512_CNL.update(args: {'val': '/Qx:CANNONLAKE', 'match': clear_arch_qx})
  AVX512_ICL.update(args: {'val': '/Qx:ICELAKE-CLIENT', 'match': clear_arch_qx})
endif

# Flag overrides for the unix-like Intel compiler, which selects ISA levels
# through -x<target> switches.
if compiler_id == 'intel'
  # `match` patterns: generic -mcpu=/-march= flags only ...
  clear_m = '^(-mcpu=|-march=)'
  # ... or those plus any -x<target> flag.
  clear_any = '^(-mcpu=|-march=|-x[A-Z0-9\-])'

  foreach fet : [FMA3, AVX2]
    fet.update(args: {'val': '-xCORE-AVX2', 'match': clear_m})
  endforeach
  foreach fet : [AVX512F, AVX512CD]
    fet.update(args: {'val': '-xCOMMON-AVX512', 'match': clear_m})
  endforeach
  # [feature, -x<target> flag] pairs; all of them use the wider pattern.
  foreach spec : [
    [AVX512_KNL, '-xKNL'],
    [AVX512_KNM, '-xKNM'],
    [AVX512_SKX, '-xSKYLAKE-AVX512'],
    [AVX512_CLX, '-xCASCADELAKE'],
    [AVX512_CNL, '-xCANNONLAKE'],
    [AVX512_ICL, '-xICELAKE-CLIENT']
  ]
    spec[0].update(args: {'val': spec[1], 'match': clear_any})
  endforeach
endif

# Flag and dependency overrides for MSVC.
if compiler_id == 'msvc'
  # MSVC compiler doesn't support the following features
  foreach fet : [AVX512_KNL, AVX512_KNM]
    fet.update(disable: compiler_id + ' compiler does not support it')
  endforeach

  # The following features don't own private flags, however the compiler
  # still provides ISA capability for them, so their flags are blanked.
  foreach fet : [
    SSE3,
    SSSE3,
    SSE41,
    POPCNT,
    SSE42,
    AVX,
    F16C,
    XOP,
    FMA4,
    AVX512F,
    AVX512CD,
    AVX512_CLX,
    AVX512_CNL,
    AVX512_ICL
  ]
    fet.update(args: '')
  endforeach

  # MSVC compiler doesn't support the following features independently
  FMA3.update(implies: [F16C, AVX2])
  AVX2.update(implies: [F16C, FMA3])
  AVX512F.update(implies: [AVX2, AVX512CD, AVX512_SKX])
  AVX512CD.update(implies: [AVX512F, AVX512_SKX])

  # `match` pattern: any /arch:<isa> flag.
  clear_arch = '/arch:.*'

  # /arch:SSE and /arch:SSE2 are only available in 32-bit mode; they are
  # enabled by default in 64-bit mode, so no flag is passed there.
  if cpu_family == 'x86'
    foreach fet : [SSE, SSE2]
      fet.update(args: {'val': '/arch:' + fet.get('name'), 'match': clear_arch})
    endforeach
  else
    foreach fet : [SSE, SSE2]
      fet.update(args: '')
    endforeach
  endif

  FMA3.update(args: {'val': '/arch:AVX2', 'match': clear_arch})
  # Add the floating-point contract flag to fix transcendental function
  # accuracy on Windows Server 2022.
  # NOTE(review): this is a second `args` update on FMA3 right after the
  # /arch:AVX2 one; confirm whether update() replaces or extends the
  # previously set args.
  FMA3.update(args: {'val': '/fp:contract'})
  AVX2.update(args: {'val': '/arch:AVX2', 'match': clear_arch})
  AVX512_SKX.update(args: {'val': '/arch:AVX512', 'match': clear_arch})
endif

# Name -> feature object lookup table for every x86 feature declared in this
# file, listed in declaration order.
X86_FEATURES = {
  'SSE': SSE,
  'SSE2': SSE2,
  'SSE3': SSE3,
  'SSSE3': SSSE3,
  'SSE41': SSE41,
  'POPCNT': POPCNT,
  'SSE42': SSE42,
  'AVX': AVX,
  'XOP': XOP,
  'FMA4': FMA4,
  'F16C': F16C,
  'FMA3': FMA3,
  'AVX2': AVX2,
  'AVX512F': AVX512F,
  'AVX512CD': AVX512CD,
  'AVX512_KNL': AVX512_KNL,
  'AVX512_KNM': AVX512_KNM,
  'AVX512_SKX': AVX512_SKX,
  'AVX512_CLX': AVX512_CLX,
  'AVX512_CNL': AVX512_CNL,
  'AVX512_ICL': AVX512_ICL,
  'AVX512_SPR': AVX512_SPR
}