import copy import itertools import os import pickle import sys import tempfile import pytest import numpy as np from numpy._core.tests._natype import get_stringdtype_dtype as get_dtype, pd_NA from numpy.dtypes import StringDType from numpy.testing import IS_PYPY, assert_array_equal @pytest.fixture def string_list(): return ["abc", "def", "ghi" * 10, "Aยขโ˜ƒโ‚ฌ ๐Ÿ˜Š" * 100, "Abc" * 1000, "DEF"] # second copy for cast tests to do a cartesian product over dtypes @pytest.fixture(params=[True, False]) def coerce2(request): return request.param @pytest.fixture( params=["unset", None, pd_NA, np.nan, float("nan"), "__nan__"], ids=["unset", "None", "pandas.NA", "np.nan", "float('nan')", "string nan"], ) def na_object2(request): return request.param @pytest.fixture() def dtype2(na_object2, coerce2): # explicit is check for pd_NA because != with pd_NA returns pd_NA if na_object2 is pd_NA or na_object2 != "unset": return StringDType(na_object=na_object2, coerce=coerce2) else: return StringDType(coerce=coerce2) def test_dtype_creation(): hashes = set() dt = StringDType() assert not hasattr(dt, "na_object") and dt.coerce is True hashes.add(hash(dt)) dt = StringDType(na_object=None) assert dt.na_object is None and dt.coerce is True hashes.add(hash(dt)) dt = StringDType(coerce=False) assert not hasattr(dt, "na_object") and dt.coerce is False hashes.add(hash(dt)) dt = StringDType(na_object=None, coerce=False) assert dt.na_object is None and dt.coerce is False hashes.add(hash(dt)) assert len(hashes) == 4 dt = np.dtype("T") assert dt == StringDType() assert dt.kind == "T" assert dt.char == "T" hashes.add(hash(dt)) assert len(hashes) == 4 def test_dtype_equality(dtype): assert dtype == dtype for ch in "SU": assert dtype != np.dtype(ch) assert dtype != np.dtype(f"{ch}8") def test_dtype_repr(dtype): if not hasattr(dtype, "na_object") and dtype.coerce: assert repr(dtype) == "StringDType()" elif dtype.coerce: assert repr(dtype) == f"StringDType(na_object={dtype.na_object!r})" elif not hasattr(dtype, "na_object"): assert repr(dtype) == "StringDType(coerce=False)" else: assert ( repr(dtype) == f"StringDType(na_object={dtype.na_object!r}, coerce=False)" ) def test_create_with_na(dtype): if not hasattr(dtype, "na_object"): pytest.skip("does not have an na object") na_val = dtype.na_object string_list = ["hello", na_val, "world"] arr = np.array(string_list, dtype=dtype) assert str(arr) == "[" + " ".join([repr(s) for s in string_list]) + "]" assert arr[1] is dtype.na_object @pytest.mark.parametrize("i", list(range(5))) def test_set_replace_na(i): # Test strings of various lengths can be set to NaN and then replaced. s_empty = "" s_short = "0123456789" s_medium = "abcdefghijklmnopqrstuvwxyz" s_long = "-=+" * 100 strings = [s_medium, s_empty, s_short, s_medium, s_long] a = np.array(strings, StringDType(na_object=np.nan)) for s in [a[i], s_medium + s_short, s_short, s_empty, s_long]: a[i] = np.nan assert np.isnan(a[i]) a[i] = s assert a[i] == s assert_array_equal(a, strings[:i] + [s] + strings[i + 1:]) def test_null_roundtripping(): data = ["hello\0world", "ABC\0DEF\0\0"] arr = np.array(data, dtype="T") assert data[0] == arr[0] assert data[1] == arr[1] def test_string_too_large_error(): arr = np.array(["a", "b", "c"], dtype=StringDType()) with pytest.raises(OverflowError): arr * (sys.maxsize + 1) @pytest.mark.parametrize( "data", [ ["abc", "def", "ghi"], ["๐Ÿคฃ", "๐Ÿ“ต", "๐Ÿ˜ฐ"], ["๐Ÿšœ", "๐Ÿ™ƒ", "๐Ÿ˜พ"], ["๐Ÿ˜น", "๐Ÿš ", "๐ŸšŒ"], ], ) def test_array_creation_utf8(dtype, data): arr = np.array(data, dtype=dtype) assert str(arr) == "[" + " ".join(["'" + str(d) + "'" for d in data]) + "]" assert arr.dtype == dtype @pytest.mark.parametrize( "data", [ [1, 2, 3], [b"abc", b"def", b"ghi"], [object, object, object], ], ) def test_scalars_string_conversion(data, dtype): try: str_vals = [str(d.decode('utf-8')) for d in data] except AttributeError: str_vals = [str(d) for d in data] if dtype.coerce: assert_array_equal( np.array(data, dtype=dtype), np.array(str_vals, dtype=dtype), ) else: with pytest.raises(ValueError): np.array(data, dtype=dtype) @pytest.mark.parametrize( ("strings"), [ ["this", "is", "an", "array"], ["โ‚ฌ", "", "๐Ÿ˜Š"], ["Aยขโ˜ƒโ‚ฌ ๐Ÿ˜Š", " Aโ˜ƒโ‚ฌยข๐Ÿ˜Š", "โ˜ƒโ‚ฌ๐Ÿ˜Š Aยข", "๐Ÿ˜Šโ˜ƒAยข โ‚ฌ"], ], ) def test_self_casts(dtype, dtype2, strings): if hasattr(dtype, "na_object"): strings = strings + [dtype.na_object] elif hasattr(dtype2, "na_object"): strings = strings + [""] arr = np.array(strings, dtype=dtype) newarr = arr.astype(dtype2) if hasattr(dtype, "na_object") and not hasattr(dtype2, "na_object"): assert newarr[-1] == str(dtype.na_object) with pytest.raises(TypeError): arr.astype(dtype2, casting="safe") elif hasattr(dtype, "na_object") and hasattr(dtype2, "na_object"): assert newarr[-1] is dtype2.na_object arr.astype(dtype2, casting="safe") elif hasattr(dtype2, "na_object"): assert newarr[-1] == "" arr.astype(dtype2, casting="safe") else: arr.astype(dtype2, casting="safe") if hasattr(dtype, "na_object") and hasattr(dtype2, "na_object"): na1 = dtype.na_object na2 = dtype2.na_object if (na1 is not na2 and # check for pd_NA first because bool(pd_NA) is an error ((na1 is pd_NA or na2 is pd_NA) or # the second check is a NaN check, spelled this way # to avoid errors from math.isnan and np.isnan (na1 != na2 and not (na1 != na1 and na2 != na2)))): with pytest.raises(TypeError): arr[:-1] == newarr[:-1] return assert_array_equal(arr[:-1], newarr[:-1]) @pytest.mark.parametrize( ("strings"), [ ["this", "is", "an", "array"], ["โ‚ฌ", "", "๐Ÿ˜Š"], ["Aยขโ˜ƒโ‚ฌ ๐Ÿ˜Š", " Aโ˜ƒโ‚ฌยข๐Ÿ˜Š", "โ˜ƒโ‚ฌ๐Ÿ˜Š Aยข", "๐Ÿ˜Šโ˜ƒAยข โ‚ฌ"], ], ) class TestStringLikeCasts: def test_unicode_casts(self, dtype, strings): arr = np.array(strings, dtype=np.str_).astype(dtype) expected = np.array(strings, dtype=dtype) assert_array_equal(arr, expected) arr_as_U8 = expected.astype("U8") assert_array_equal(arr_as_U8, np.array(strings, dtype="U8")) assert_array_equal(arr_as_U8.astype(dtype), arr) arr_as_U3 = expected.astype("U3") assert_array_equal(arr_as_U3, np.array(strings, dtype="U3")) assert_array_equal( arr_as_U3.astype(dtype), np.array([s[:3] for s in strings], dtype=dtype), ) def test_void_casts(self, dtype, strings): sarr = np.array(strings, dtype=dtype) utf8_bytes = [s.encode("utf-8") for s in strings] void_dtype = f"V{max(len(s) for s in utf8_bytes)}" varr = np.array(utf8_bytes, dtype=void_dtype) assert_array_equal(varr, sarr.astype(void_dtype)) assert_array_equal(varr.astype(dtype), sarr) def test_bytes_casts(self, dtype, strings): sarr = np.array(strings, dtype=dtype) try: utf8_bytes = [s.encode("ascii") for s in strings] bytes_dtype = f"S{max(len(s) for s in utf8_bytes)}" barr = np.array(utf8_bytes, dtype=bytes_dtype) assert_array_equal(barr, sarr.astype(bytes_dtype)) assert_array_equal(barr.astype(dtype), sarr) if dtype.coerce: barr = np.array(utf8_bytes, dtype=dtype) assert_array_equal(barr, sarr) barr = np.array(utf8_bytes, dtype="O") assert_array_equal(barr.astype(dtype), sarr) else: with pytest.raises(ValueError): np.array(utf8_bytes, dtype=dtype) except UnicodeEncodeError: with pytest.raises(UnicodeEncodeError): sarr.astype("S20") def test_additional_unicode_cast(random_string_list, dtype): arr = np.array(random_string_list, dtype=dtype) # test that this short-circuits correctly assert_array_equal(arr, arr.astype(arr.dtype)) # tests the casts via the comparison promoter assert_array_equal(arr, arr.astype(random_string_list.dtype)) def test_insert_scalar(dtype, string_list): """Test that inserting a scalar works.""" arr = np.array(string_list, dtype=dtype) scalar_instance = "what" arr[1] = scalar_instance assert_array_equal( arr, np.array(string_list[:1] + ["what"] + string_list[2:], dtype=dtype), ) comparison_operators = [ np.equal, np.not_equal, np.greater, np.greater_equal, np.less, np.less_equal, ] @pytest.mark.parametrize("op", comparison_operators) @pytest.mark.parametrize("o_dtype", [np.str_, object, StringDType()]) def test_comparisons(string_list, dtype, op, o_dtype): sarr = np.array(string_list, dtype=dtype) oarr = np.array(string_list, dtype=o_dtype) # test that comparison operators work res = op(sarr, sarr) ores = op(oarr, oarr) # test that promotion works as well orres = op(sarr, oarr) olres = op(oarr, sarr) assert_array_equal(res, ores) assert_array_equal(res, orres) assert_array_equal(res, olres) # test we get the correct answer for unequal length strings sarr2 = np.array([s + "2" for s in string_list], dtype=dtype) oarr2 = np.array([s + "2" for s in string_list], dtype=o_dtype) res = op(sarr, sarr2) ores = op(oarr, oarr2) olres = op(oarr, sarr2) orres = op(sarr, oarr2) assert_array_equal(res, ores) assert_array_equal(res, olres) assert_array_equal(res, orres) res = op(sarr2, sarr) ores = op(oarr2, oarr) olres = op(oarr2, sarr) orres = op(sarr2, oarr) assert_array_equal(res, ores) assert_array_equal(res, olres) assert_array_equal(res, orres) def test_isnan(dtype, string_list): if not hasattr(dtype, "na_object"): pytest.skip("no na support") sarr = np.array(string_list + [dtype.na_object], dtype=dtype) is_nan = isinstance(dtype.na_object, float) and np.isnan(dtype.na_object) bool_errors = 0 try: bool(dtype.na_object) except TypeError: bool_errors = 1 if is_nan or bool_errors: # isnan is only true when na_object is a NaN assert_array_equal( np.isnan(sarr), np.array([0] * len(string_list) + [1], dtype=np.bool), ) else: assert not np.any(np.isnan(sarr)) def test_pickle(dtype, string_list): arr = np.array(string_list, dtype=dtype) with tempfile.NamedTemporaryFile("wb", delete=False) as f: pickle.dump([arr, dtype], f) with open(f.name, "rb") as f: res = pickle.load(f) assert_array_equal(res[0], arr) assert res[1] == dtype os.remove(f.name) def test_stdlib_copy(dtype, string_list): arr = np.array(string_list, dtype=dtype) assert_array_equal(copy.copy(arr), arr) assert_array_equal(copy.deepcopy(arr), arr) @pytest.mark.parametrize( "strings", [ ["left", "right", "leftovers", "righty", "up", "down"], [ "left" * 10, "right" * 10, "leftovers" * 10, "righty" * 10, "up" * 10, ], ["๐Ÿคฃ๐Ÿคฃ", "๐Ÿคฃ", "๐Ÿ“ต", "๐Ÿ˜ฐ"], ["๐Ÿšœ", "๐Ÿ™ƒ", "๐Ÿ˜พ"], ["๐Ÿ˜น", "๐Ÿš ", "๐ŸšŒ"], ["Aยขโ˜ƒโ‚ฌ ๐Ÿ˜Š", " Aโ˜ƒโ‚ฌยข๐Ÿ˜Š", "โ˜ƒโ‚ฌ๐Ÿ˜Š Aยข", "๐Ÿ˜Šโ˜ƒAยข โ‚ฌ"], ], ) def test_sort(dtype, strings): """Test that sorting matches python's internal sorting.""" def test_sort(strings, arr_sorted): arr = np.array(strings, dtype=dtype) na_object = getattr(arr.dtype, "na_object", "") if na_object is None and None in strings: with pytest.raises( ValueError, match="Cannot compare null that is not a nan-like value", ): np.argsort(arr) argsorted = None elif na_object is pd_NA or na_object != '': argsorted = None else: argsorted = np.argsort(arr) np.random.default_rng().shuffle(arr) if na_object is None and None in strings: with pytest.raises( ValueError, match="Cannot compare null that is not a nan-like value", ): arr.sort() else: arr.sort() assert np.array_equal(arr, arr_sorted, equal_nan=True) if argsorted is not None: assert np.array_equal(argsorted, np.argsort(strings)) # make a copy so we don't mutate the lists in the fixture strings = strings.copy() arr_sorted = np.array(sorted(strings), dtype=dtype) test_sort(strings, arr_sorted) if not hasattr(dtype, "na_object"): return # make sure NAs get sorted to the end of the array and string NAs get # sorted like normal strings strings.insert(0, dtype.na_object) strings.insert(2, dtype.na_object) # can't use append because doing that with NA converts # the result to object dtype if not isinstance(dtype.na_object, str): arr_sorted = np.array( arr_sorted.tolist() + [dtype.na_object, dtype.na_object], dtype=dtype, ) else: arr_sorted = np.array(sorted(strings), dtype=dtype) test_sort(strings, arr_sorted) @pytest.mark.parametrize( "strings", [ ["Aยขโ˜ƒโ‚ฌ ๐Ÿ˜Š", " Aโ˜ƒโ‚ฌยข๐Ÿ˜Š", "โ˜ƒโ‚ฌ๐Ÿ˜Š Aยข", "๐Ÿ˜Šโ˜ƒAยข โ‚ฌ"], ["Aยขโ˜ƒโ‚ฌ ๐Ÿ˜Š", "", " ", "๏€ "], ["", "a", "๐Ÿ˜ธ", "รกรกรฐfรกรญรณรฅรฉรซ"], ], ) def test_nonzero(strings, na_object): dtype = get_dtype(na_object) arr = np.array(strings, dtype=dtype) is_nonzero = np.array( [i for i, item in enumerate(strings) if len(item) != 0]) assert_array_equal(arr.nonzero()[0], is_nonzero) if na_object is not pd_NA and na_object == 'unset': return strings_with_na = np.array(strings + [na_object], dtype=dtype) is_nan = np.isnan(np.array([dtype.na_object], dtype=dtype))[0] if is_nan: assert strings_with_na.nonzero()[0][-1] == 4 else: assert strings_with_na.nonzero()[0][-1] == 3 # check that the casting to bool and nonzero give consistent results assert_array_equal(strings_with_na[strings_with_na.nonzero()], strings_with_na[strings_with_na.astype(bool)]) def test_where(string_list, na_object): dtype = get_dtype(na_object) a = np.array(string_list, dtype=dtype) b = a[::-1] res = np.where([True, False, True, False, True, False], a, b) assert_array_equal(res, [a[0], b[1], a[2], b[3], a[4], b[5]]) def test_fancy_indexing(string_list): sarr = np.array(string_list, dtype="T") assert_array_equal(sarr, sarr[np.arange(sarr.shape[0])]) inds = [ [True, True], [0, 1], ..., np.array([0, 1], dtype='uint8'), ] lops = [ ['a' * 25, 'b' * 25], ['', ''], ['hello', 'world'], ['hello', 'world' * 25], ] # see gh-27003 and gh-27053 for ind in inds: for lop in lops: a = np.array(lop, dtype="T") assert_array_equal(a[ind], a) rop = ['d' * 25, 'e' * 25] for b in [rop, np.array(rop, dtype="T")]: a[ind] = b assert_array_equal(a, b) assert a[0] == 'd' * 25 # see gh-29279 data = [ ["AAAAAAAAAAAAAAAAA"], ["BBBBBBBBBBBBBBBBBBBBBBBBBBBBB"], ["CCCCCCCCCCCCCCCCC"], ["DDDDDDDDDDDDDDDDD"], ] sarr = np.array(data, dtype=np.dtypes.StringDType()) uarr = np.array(data, dtype="U30") for ind in [[0], [1], [2], [3], [[0, 0]], [[1, 1, 3]], [[1, 1]]]: assert_array_equal(sarr[ind], uarr[ind]) def test_creation_functions(): assert_array_equal(np.zeros(3, dtype="T"), ["", "", ""]) assert_array_equal(np.empty(3, dtype="T"), ["", "", ""]) assert np.zeros(3, dtype="T")[0] == "" assert np.empty(3, dtype="T")[0] == "" def test_concatenate(string_list): sarr = np.array(string_list, dtype="T") sarr_cat = np.array(string_list + string_list, dtype="T") assert_array_equal(np.concatenate([sarr], axis=0), sarr) def test_resize_method(string_list): sarr = np.array(string_list, dtype="T") if IS_PYPY: sarr.resize(len(string_list) + 3, refcheck=False) else: sarr.resize(len(string_list) + 3) assert_array_equal(sarr, np.array(string_list + [''] * 3, dtype="T")) def test_create_with_copy_none(string_list): arr = np.array(string_list, dtype=StringDType()) # create another stringdtype array with an arena that has a different # in-memory layout than the first array arr_rev = np.array(string_list[::-1], dtype=StringDType()) # this should create a copy and the resulting array # shouldn't share an allocator or arena with arr_rev, despite # explicitly passing arr_rev.dtype arr_copy = np.array(arr, copy=None, dtype=arr_rev.dtype) np.testing.assert_array_equal(arr, arr_copy) assert arr_copy.base is None with pytest.raises(ValueError, match="Unable to avoid copy"): np.array(arr, copy=False, dtype=arr_rev.dtype) # because we're using arr's dtype instance, the view is safe arr_view = np.array(arr, copy=None, dtype=arr.dtype) np.testing.assert_array_equal(arr, arr) np.testing.assert_array_equal(arr_view[::-1], arr_rev) assert arr_view is arr def test_astype_copy_false(): orig_dt = StringDType() arr = np.array(["hello", "world"], dtype=StringDType()) assert not arr.astype(StringDType(coerce=False), copy=False).dtype.coerce assert arr.astype(orig_dt, copy=False).dtype is orig_dt @pytest.mark.parametrize( "strings", [ ["left", "right", "leftovers", "righty", "up", "down"], ["๐Ÿคฃ๐Ÿคฃ", "๐Ÿคฃ", "๐Ÿ“ต", "๐Ÿ˜ฐ"], ["๐Ÿšœ", "๐Ÿ™ƒ", "๐Ÿ˜พ"], ["๐Ÿ˜น", "๐Ÿš ", "๐ŸšŒ"], ["Aยขโ˜ƒโ‚ฌ ๐Ÿ˜Š", " Aโ˜ƒโ‚ฌยข๐Ÿ˜Š", "โ˜ƒโ‚ฌ๐Ÿ˜Š Aยข", "๐Ÿ˜Šโ˜ƒAยข โ‚ฌ"], ], ) def test_argmax(strings): """Test that argmax/argmin matches what python calculates.""" arr = np.array(strings, dtype="T") assert np.argmax(arr) == strings.index(max(strings)) assert np.argmin(arr) == strings.index(min(strings)) @pytest.mark.parametrize( "arrfunc,expected", [ [np.sort, None], [np.nonzero, (np.array([], dtype=np.int_),)], [np.argmax, 0], [np.argmin, 0], ], ) def test_arrfuncs_zeros(arrfunc, expected): arr = np.zeros(10, dtype="T") result = arrfunc(arr) if expected is None: expected = arr assert_array_equal(result, expected, strict=True) @pytest.mark.parametrize( ("strings", "cast_answer", "any_answer", "all_answer"), [ [["hello", "world"], [True, True], True, True], [["", ""], [False, False], False, False], [["hello", ""], [True, False], True, False], [["", "world"], [False, True], True, False], ], ) def test_cast_to_bool(strings, cast_answer, any_answer, all_answer): sarr = np.array(strings, dtype="T") assert_array_equal(sarr.astype("bool"), cast_answer) assert np.any(sarr) == any_answer assert np.all(sarr) == all_answer @pytest.mark.parametrize( ("strings", "cast_answer"), [ [[True, True], ["True", "True"]], [[False, False], ["False", "False"]], [[True, False], ["True", "False"]], [[False, True], ["False", "True"]], ], ) def test_cast_from_bool(strings, cast_answer): barr = np.array(strings, dtype=bool) assert_array_equal(barr.astype("T"), np.array(cast_answer, dtype="T")) @pytest.mark.parametrize("bitsize", [8, 16, 32, 64]) @pytest.mark.parametrize("signed", [True, False]) def test_sized_integer_casts(bitsize, signed): idtype = f"int{bitsize}" if signed: inp = [-(2**p - 1) for p in reversed(range(bitsize - 1))] inp += [2**p - 1 for p in range(1, bitsize - 1)] else: idtype = "u" + idtype inp = [2**p - 1 for p in range(bitsize)] ainp = np.array(inp, dtype=idtype) assert_array_equal(ainp, ainp.astype("T").astype(idtype)) # safe casting works ainp.astype("T", casting="safe") with pytest.raises(TypeError): ainp.astype("T").astype(idtype, casting="safe") oob = [str(2**bitsize), str(-(2**bitsize))] with pytest.raises(OverflowError): np.array(oob, dtype="T").astype(idtype) with pytest.raises(ValueError): np.array(["1", np.nan, "3"], dtype=StringDType(na_object=np.nan)).astype(idtype) @pytest.mark.parametrize("typename", ["byte", "short", "int", "longlong"]) @pytest.mark.parametrize("signed", ["", "u"]) def test_unsized_integer_casts(typename, signed): idtype = f"{signed}{typename}" inp = [1, 2, 3, 4] ainp = np.array(inp, dtype=idtype) assert_array_equal(ainp, ainp.astype("T").astype(idtype)) @pytest.mark.parametrize( "typename", [ pytest.param( "longdouble", marks=pytest.mark.xfail( np.dtypes.LongDoubleDType() != np.dtypes.Float64DType(), reason="numpy lacks an ld2a implementation", strict=True, ), ), "float64", "float32", "float16", ], ) def test_float_casts(typename): inp = [1.1, 2.8, -3.2, 2.7e4] ainp = np.array(inp, dtype=typename) assert_array_equal(ainp, ainp.astype("T").astype(typename)) inp = [0.1] sres = np.array(inp, dtype=typename).astype("T") res = sres.astype(typename) assert_array_equal(np.array(inp, dtype=typename), res) assert sres[0] == "0.1" if typename == "longdouble": # let's not worry about platform-dependent rounding of longdouble return fi = np.finfo(typename) inp = [1e-324, fi.smallest_subnormal, -1e-324, -fi.smallest_subnormal] eres = [0, fi.smallest_subnormal, -0, -fi.smallest_subnormal] res = np.array(inp, dtype=typename).astype("T").astype(typename) assert_array_equal(eres, res) inp = [2e308, fi.max, -2e308, fi.min] eres = [np.inf, fi.max, -np.inf, fi.min] res = np.array(inp, dtype=typename).astype("T").astype(typename) assert_array_equal(eres, res) def test_float_nan_cast_na_object(): # gh-28157 dt = np.dtypes.StringDType(na_object=np.nan) arr1 = np.full((1,), fill_value=np.nan, dtype=dt) arr2 = np.full_like(arr1, fill_value=np.nan) assert arr1.item() is np.nan assert arr2.item() is np.nan inp = [1.2, 2.3, np.nan] arr = np.array(inp).astype(dt) assert arr[2] is np.nan assert arr[0] == '1.2' @pytest.mark.parametrize( "typename", [ "csingle", "cdouble", pytest.param( "clongdouble", marks=pytest.mark.xfail( np.dtypes.CLongDoubleDType() != np.dtypes.Complex128DType(), reason="numpy lacks an ld2a implementation", strict=True, ), ), ], ) def test_cfloat_casts(typename): inp = [1.1 + 1.1j, 2.8 + 2.8j, -3.2 - 3.2j, 2.7e4 + 2.7e4j] ainp = np.array(inp, dtype=typename) assert_array_equal(ainp, ainp.astype("T").astype(typename)) inp = [0.1 + 0.1j] sres = np.array(inp, dtype=typename).astype("T") res = sres.astype(typename) assert_array_equal(np.array(inp, dtype=typename), res) assert sres[0] == "(0.1+0.1j)" def test_take(string_list): sarr = np.array(string_list, dtype="T") res = sarr.take(np.arange(len(string_list))) assert_array_equal(sarr, res) # make sure it also works for out out = np.empty(len(string_list), dtype="T") out[0] = "hello" res = sarr.take(np.arange(len(string_list)), out=out) assert res is out assert_array_equal(sarr, res) @pytest.mark.parametrize("use_out", [True, False]) @pytest.mark.parametrize( "ufunc_name,func", [ ("min", min), ("max", max), ], ) def test_ufuncs_minmax(string_list, ufunc_name, func, use_out): """Test that the min/max ufuncs match Python builtin min/max behavior.""" arr = np.array(string_list, dtype="T") uarr = np.array(string_list, dtype=str) res = np.array(func(string_list), dtype="T") assert_array_equal(getattr(arr, ufunc_name)(), res) ufunc = getattr(np, ufunc_name + "imum") if use_out: res = ufunc(arr, arr, out=arr) else: res = ufunc(arr, arr) assert_array_equal(uarr, res) assert_array_equal(getattr(arr, ufunc_name)(), func(string_list)) def test_max_regression(): arr = np.array(['y', 'y', 'z'], dtype="T") assert arr.max() == 'z' @pytest.mark.parametrize("use_out", [True, False]) @pytest.mark.parametrize( "other_strings", [ ["abc", "def" * 500, "ghi" * 16, "๐Ÿคฃ" * 100, "๐Ÿ“ต", "๐Ÿ˜ฐ"], ["๐Ÿšœ", "๐Ÿ™ƒ", "๐Ÿ˜พ", "๐Ÿ˜น", "๐Ÿš ", "๐ŸšŒ"], ["๐Ÿฅฆ", "ยจ", "โจฏ", "โˆฐ ", "โจŒ ", "โŽถ "], ], ) def test_ufunc_add(dtype, string_list, other_strings, use_out): arr1 = np.array(string_list, dtype=dtype) arr2 = np.array(other_strings, dtype=dtype) result = np.array([a + b for a, b in zip(arr1, arr2)], dtype=dtype) if use_out: res = np.add(arr1, arr2, out=arr1) else: res = np.add(arr1, arr2) assert_array_equal(res, result) if not hasattr(dtype, "na_object"): return is_nan = isinstance(dtype.na_object, float) and np.isnan(dtype.na_object) is_str = isinstance(dtype.na_object, str) bool_errors = 0 try: bool(dtype.na_object) except TypeError: bool_errors = 1 arr1 = np.array([dtype.na_object] + string_list, dtype=dtype) arr2 = np.array(other_strings + [dtype.na_object], dtype=dtype) if is_nan or bool_errors or is_str: res = np.add(arr1, arr2) assert_array_equal(res[1:-1], arr1[1:-1] + arr2[1:-1]) if not is_str: assert res[0] is dtype.na_object and res[-1] is dtype.na_object else: assert res[0] == dtype.na_object + arr2[0] assert res[-1] == arr1[-1] + dtype.na_object else: with pytest.raises(ValueError): np.add(arr1, arr2) def test_ufunc_add_reduce(dtype): values = ["a", "this is a long string", "c"] arr = np.array(values, dtype=dtype) out = np.empty((), dtype=dtype) expected = np.array("".join(values), dtype=dtype) assert_array_equal(np.add.reduce(arr), expected) np.add.reduce(arr, out=out) assert_array_equal(out, expected) def test_add_promoter(string_list): arr = np.array(string_list, dtype=StringDType()) lresult = np.array(["hello" + s for s in string_list], dtype=StringDType()) rresult = np.array([s + "hello" for s in string_list], dtype=StringDType()) for op in ["hello", np.str_("hello"), np.array(["hello"])]: assert_array_equal(op + arr, lresult) assert_array_equal(arr + op, rresult) # The promoter should be able to handle things if users pass `dtype=` res = np.add("hello", string_list, dtype=StringDType) assert res.dtype == StringDType() # The promoter should not kick in if users override the input, # which means arr is cast, this fails because of the unknown length. with pytest.raises(TypeError, match="cannot cast dtype"): np.add(arr, "add", signature=("U", "U", None), casting="unsafe") # But it must simply reject the following: with pytest.raises(TypeError, match=".*did not contain a loop"): np.add(arr, "add", signature=(None, "U", None)) with pytest.raises(TypeError, match=".*did not contain a loop"): np.add("a", "b", signature=("U", "U", StringDType)) def test_add_no_legacy_promote_with_signature(): # Possibly misplaced, but useful to test with string DType. We check that # if there is clearly no loop found, a stray `dtype=` doesn't break things # Regression test for the bad error in gh-26735 # (If legacy promotion is gone, this can be deleted...) with pytest.raises(TypeError, match=".*did not contain a loop"): np.add("3", 6, dtype=StringDType) def test_add_promoter_reduce(): # Exact TypeError could change, but ensure StringDtype doesn't match with pytest.raises(TypeError, match="the resolved dtypes are not"): np.add.reduce(np.array(["a", "b"], dtype="U")) # On the other hand, using `dtype=T` in the *ufunc* should work. np.add.reduce(np.array(["a", "b"], dtype="U"), dtype=np.dtypes.StringDType) def test_multiply_reduce(): # At the time of writing (NumPy 2.0) this is very limited (and rather # ridiculous anyway). But it works and actually makes some sense... # (NumPy does not allow non-scalar initial values) repeats = np.array([2, 3, 4]) val = "school-๐ŸšŒ" res = np.multiply.reduce(repeats, initial=val, dtype=np.dtypes.StringDType) assert res == val * np.prod(repeats) def test_multiply_two_string_raises(): arr = np.array(["hello", "world"], dtype="T") with pytest.raises(np._core._exceptions._UFuncNoLoopError): np.multiply(arr, arr) @pytest.mark.parametrize("use_out", [True, False]) @pytest.mark.parametrize("other", [2, [2, 1, 3, 4, 1, 3]]) @pytest.mark.parametrize( "other_dtype", [ None, "int8", "int16", "int32", "int64", "uint8", "uint16", "uint32", "uint64", "short", "int", "intp", "long", "longlong", "ushort", "uint", "uintp", "ulong", "ulonglong", ], ) def test_ufunc_multiply(dtype, string_list, other, other_dtype, use_out): """Test the two-argument ufuncs match python builtin behavior.""" arr = np.array(string_list, dtype=dtype) if other_dtype is not None: other_dtype = np.dtype(other_dtype) try: len(other) result = [s * o for s, o in zip(string_list, other)] other = np.array(other) if other_dtype is not None: other = other.astype(other_dtype) except TypeError: if other_dtype is not None: other = other_dtype.type(other) result = [s * other for s in string_list] if use_out: arr_cache = arr.copy() lres = np.multiply(arr, other, out=arr) assert_array_equal(lres, result) arr[:] = arr_cache assert lres is arr arr *= other assert_array_equal(arr, result) arr[:] = arr_cache rres = np.multiply(other, arr, out=arr) assert rres is arr assert_array_equal(rres, result) else: lres = arr * other assert_array_equal(lres, result) rres = other * arr assert_array_equal(rres, result) if not hasattr(dtype, "na_object"): return is_nan = np.isnan(np.array([dtype.na_object], dtype=dtype))[0] is_str = isinstance(dtype.na_object, str) bool_errors = 0 try: bool(dtype.na_object) except TypeError: bool_errors = 1 arr = np.array(string_list + [dtype.na_object], dtype=dtype) try: len(other) other = np.append(other, 3) if other_dtype is not None: other = other.astype(other_dtype) except TypeError: pass if is_nan or bool_errors or is_str: for res in [arr * other, other * arr]: assert_array_equal(res[:-1], result) if not is_str: assert res[-1] is dtype.na_object else: try: assert res[-1] == dtype.na_object * other[-1] except (IndexError, TypeError): assert res[-1] == dtype.na_object * other else: with pytest.raises(TypeError): arr * other with pytest.raises(TypeError): other * arr def test_findlike_promoters(): r = "Wally" l = "Where's Wally?" s = np.int32(3) e = np.int8(13) for dtypes in [("T", "U"), ("U", "T")]: for function, answer in [ (np.strings.index, 8), (np.strings.endswith, True), ]: assert answer == function( np.array(l, dtype=dtypes[0]), np.array(r, dtype=dtypes[1]), s, e ) def test_strip_promoter(): arg = ["Hello!!!!", "Hello??!!"] strip_char = "!" answer = ["Hello", "Hello??"] for dtypes in [("T", "U"), ("U", "T")]: result = np.strings.strip( np.array(arg, dtype=dtypes[0]), np.array(strip_char, dtype=dtypes[1]) ) assert_array_equal(result, answer) assert result.dtype.char == "T" def test_replace_promoter(): arg = ["Hello, planet!", "planet, Hello!"] old = "planet" new = "world" answer = ["Hello, world!", "world, Hello!"] for dtypes in itertools.product("TU", repeat=3): if dtypes == ("U", "U", "U"): continue answer_arr = np.strings.replace( np.array(arg, dtype=dtypes[0]), np.array(old, dtype=dtypes[1]), np.array(new, dtype=dtypes[2]), ) assert_array_equal(answer_arr, answer) assert answer_arr.dtype.char == "T" def test_center_promoter(): arg = ["Hello", "planet!"] fillchar = "/" for dtypes in [("T", "U"), ("U", "T")]: answer = np.strings.center( np.array(arg, dtype=dtypes[0]), 9, np.array(fillchar, dtype=dtypes[1]) ) assert_array_equal(answer, ["//Hello//", "/planet!/"]) assert answer.dtype.char == "T" DATETIME_INPUT = [ np.datetime64("1923-04-14T12:43:12"), np.datetime64("1994-06-21T14:43:15"), np.datetime64("2001-10-15T04:10:32"), np.datetime64("NaT"), np.datetime64("1995-11-25T16:02:16"), np.datetime64("2005-01-04T03:14:12"), np.datetime64("2041-12-03T14:05:03"), ] TIMEDELTA_INPUT = [ np.timedelta64(12358, "s"), np.timedelta64(23, "s"), np.timedelta64(74, "s"), np.timedelta64("NaT"), np.timedelta64(23, "s"), np.timedelta64(73, "s"), np.timedelta64(7, "s"), ] @pytest.mark.parametrize( "input_data, input_dtype", [ (DATETIME_INPUT, "M8[s]"), (TIMEDELTA_INPUT, "m8[s]") ] ) def test_datetime_timedelta_cast(dtype, input_data, input_dtype): a = np.array(input_data, dtype=input_dtype) has_na = hasattr(dtype, "na_object") is_str = isinstance(getattr(dtype, "na_object", None), str) if not has_na or is_str: a = np.delete(a, 3) sa = a.astype(dtype) ra = sa.astype(a.dtype) if has_na and not is_str: assert sa[3] is dtype.na_object assert np.isnat(ra[3]) assert_array_equal(a, ra) if has_na and not is_str: # don't worry about comparing how NaT is converted sa = np.delete(sa, 3) a = np.delete(a, 3) if input_dtype.startswith("M"): assert_array_equal(sa, a.astype("U")) else: # The timedelta to unicode cast produces strings # that aren't round-trippable and we don't want to # reproduce that behavior in stringdtype assert_array_equal(sa, a.astype("int64").astype("U")) def test_nat_casts(): s = 'nat' all_nats = itertools.product(*zip(s.upper(), s.lower())) all_nats = list(map(''.join, all_nats)) NaT_dt = np.datetime64('NaT') NaT_td = np.timedelta64('NaT') for na_object in [np._NoValue, None, np.nan, 'nat', '']: # numpy treats empty string and all case combinations of 'nat' as NaT dtype = StringDType(na_object=na_object) arr = np.array([''] + all_nats, dtype=dtype) dt_array = arr.astype('M8[s]') td_array = arr.astype('m8[s]') assert_array_equal(dt_array, NaT_dt) assert_array_equal(td_array, NaT_td) if na_object is np._NoValue: output_object = 'NaT' else: output_object = na_object for arr in [dt_array, td_array]: assert_array_equal( arr.astype(dtype), np.array([output_object] * arr.size, dtype=dtype)) def test_nat_conversion(): for nat in [np.datetime64("NaT", "s"), np.timedelta64("NaT", "s")]: with pytest.raises(ValueError, match="string coercion is disabled"): np.array(["a", nat], dtype=StringDType(coerce=False)) def test_growing_strings(dtype): # growing a string leads to a heap allocation, this tests to make sure # we do that bookkeeping correctly for all possible starting cases data = [ "hello", # a short string "abcdefghijklmnopqestuvwxyz", # a medium heap-allocated string "hello" * 200, # a long heap-allocated string ] arr = np.array(data, dtype=dtype) uarr = np.array(data, dtype=str) for _ in range(5): arr = arr + arr uarr = uarr + uarr assert_array_equal(arr, uarr) def test_assign_medium_strings(): # see gh-29261 N = 9 src = np.array( ( ['0' * 256] * 3 + ['0' * 255] + ['0' * 256] + ['0' * 255] + ['0' * 256] * 2 + ['0' * 255] ), dtype='T') dst = np.array( ( ['0' * 255] + ['0' * 256] * 2 + ['0' * 255] + ['0' * 256] + ['0' * 255] + [''] * 5 ), dtype='T') dst[1:N + 1] = src assert_array_equal(dst[1:N + 1], src) UFUNC_TEST_DATA = [ "hello" * 10, "Aeยขโ˜ƒโ‚ฌ ๐Ÿ˜Š" * 20, "entry\nwith\nnewlines", "entry\twith\ttabs", ] @pytest.fixture def string_array(dtype): return np.array(UFUNC_TEST_DATA, dtype=dtype) @pytest.fixture def unicode_array(): return np.array(UFUNC_TEST_DATA, dtype=np.str_) NAN_PRESERVING_FUNCTIONS = [ "capitalize", "expandtabs", "lower", "lstrip", "rstrip", "splitlines", "strip", "swapcase", "title", "upper", ] BOOL_OUTPUT_FUNCTIONS = [ "isalnum", "isalpha", "isdigit", "islower", "isspace", "istitle", "isupper", "isnumeric", "isdecimal", ] UNARY_FUNCTIONS = [ "str_len", "capitalize", "expandtabs", "isalnum", "isalpha", "isdigit", "islower", "isspace", "istitle", "isupper", "lower", "lstrip", "rstrip", "splitlines", "strip", "swapcase", "title", "upper", "isnumeric", "isdecimal", "isalnum", "islower", "istitle", "isupper", ] UNIMPLEMENTED_VEC_STRING_FUNCTIONS = [ "capitalize", "expandtabs", "lower", "splitlines", "swapcase", "title", "upper", ] ONLY_IN_NP_CHAR = [ "join", "split", "rsplit", "splitlines" ] @pytest.mark.parametrize("function_name", UNARY_FUNCTIONS) def test_unary(string_array, unicode_array, function_name): if function_name in ONLY_IN_NP_CHAR: func = getattr(np.char, function_name) else: func = getattr(np.strings, function_name) dtype = string_array.dtype sres = func(string_array) ures = func(unicode_array) if sres.dtype == StringDType(): ures = ures.astype(StringDType()) assert_array_equal(sres, ures) if not hasattr(dtype, "na_object"): return is_nan = np.isnan(np.array([dtype.na_object], dtype=dtype))[0] is_str = isinstance(dtype.na_object, str) na_arr = np.insert(string_array, 0, dtype.na_object) if function_name in UNIMPLEMENTED_VEC_STRING_FUNCTIONS: if not is_str: # to avoid these errors we'd need to add NA support to _vec_string with pytest.raises((ValueError, TypeError)): func(na_arr) elif function_name == "splitlines": assert func(na_arr)[0] == func(dtype.na_object)[()] else: assert func(na_arr)[0] == func(dtype.na_object) return if function_name == "str_len" and not is_str: # str_len always errors for any non-string null, even NA ones because # it has an integer result with pytest.raises(ValueError): func(na_arr) return if function_name in BOOL_OUTPUT_FUNCTIONS: if is_nan: assert func(na_arr)[0] is np.False_ elif is_str: assert func(na_arr)[0] == func(dtype.na_object) else: with pytest.raises(ValueError): func(na_arr) return if not (is_nan or is_str): with pytest.raises(ValueError): func(na_arr) return res = func(na_arr) if is_nan and function_name in NAN_PRESERVING_FUNCTIONS: assert res[0] is dtype.na_object elif is_str: assert res[0] == func(dtype.na_object) unicode_bug_fail = pytest.mark.xfail( reason="unicode output width is buggy", strict=True ) # None means that the argument is a string array BINARY_FUNCTIONS = [ ("add", (None, None)), ("multiply", (None, 2)), ("mod", ("format: %s", None)), ("center", (None, 25)), ("count", (None, "A")), ("encode", (None, "UTF-8")), ("endswith", (None, "lo")), ("find", (None, "A")), ("index", (None, "e")), ("join", ("-", None)), ("ljust", (None, 12)), ("lstrip", (None, "A")), ("partition", (None, "A")), ("replace", (None, "A", "B")), ("rfind", (None, "A")), ("rindex", (None, "e")), ("rjust", (None, 12)), ("rsplit", (None, "A")), ("rstrip", (None, "A")), ("rpartition", (None, "A")), ("split", (None, "A")), ("strip", (None, "A")), ("startswith", (None, "A")), ("zfill", (None, 12)), ] PASSES_THROUGH_NAN_NULLS = [ "add", "center", "ljust", "multiply", "replace", "rjust", "strip", "lstrip", "rstrip", "replace" "zfill", ] NULLS_ARE_FALSEY = [ "startswith", "endswith", ] NULLS_ALWAYS_ERROR = [ "count", "find", "rfind", ] SUPPORTS_NULLS = ( PASSES_THROUGH_NAN_NULLS + NULLS_ARE_FALSEY + NULLS_ALWAYS_ERROR ) def call_func(func, args, array, sanitize=True): if args == (None, None): return func(array, array) if args[0] is None: if sanitize: san_args = tuple( np.array(arg, dtype=array.dtype) if isinstance(arg, str) else arg for arg in args[1:] ) else: san_args = args[1:] return func(array, *san_args) if args[1] is None: return func(args[0], array) # shouldn't ever happen assert 0 @pytest.mark.parametrize("function_name, args", BINARY_FUNCTIONS) def test_binary(string_array, unicode_array, function_name, args): if function_name in ONLY_IN_NP_CHAR: func = getattr(np.char, function_name) else: func = getattr(np.strings, function_name) sres = call_func(func, args, string_array) ures = call_func(func, args, unicode_array, sanitize=False) if not isinstance(sres, tuple) and sres.dtype == StringDType(): ures = ures.astype(StringDType()) assert_array_equal(sres, ures) dtype = string_array.dtype if function_name not in SUPPORTS_NULLS or not hasattr(dtype, "na_object"): return na_arr = np.insert(string_array, 0, dtype.na_object) is_nan = np.isnan(np.array([dtype.na_object], dtype=dtype))[0] is_str = isinstance(dtype.na_object, str) should_error = not (is_nan or is_str) if ( (function_name in NULLS_ALWAYS_ERROR and not is_str) or (function_name in PASSES_THROUGH_NAN_NULLS and should_error) or (function_name in NULLS_ARE_FALSEY and should_error) ): with pytest.raises((ValueError, TypeError)): call_func(func, args, na_arr) return res = call_func(func, args, na_arr) if is_str: assert res[0] == call_func(func, args, na_arr[:1]) elif function_name in NULLS_ARE_FALSEY: assert res[0] is np.False_ elif function_name in PASSES_THROUGH_NAN_NULLS: assert res[0] is dtype.na_object else: # shouldn't ever get here assert 0 @pytest.mark.parametrize("function, expected", [ (np.strings.find, [[2, -1], [1, -1]]), (np.strings.startswith, [[False, False], [True, False]])]) @pytest.mark.parametrize("start, stop", [ (1, 4), (np.int8(1), np.int8(4)), (np.array([1, 1], dtype='u2'), np.array([4, 4], dtype='u2'))]) def test_non_default_start_stop(function, start, stop, expected): a = np.array([["--๐Ÿ--", "--๐Ÿฆœ--"], ["-๐Ÿ---", "-๐Ÿฆœ---"]], "T") indx = function(a, "๐Ÿ", start, stop) assert_array_equal(indx, expected) @pytest.mark.parametrize("count", [2, np.int8(2), np.array([2, 2], 'u2')]) def test_replace_non_default_repeat(count): a = np.array(["๐Ÿ--", "๐Ÿฆœ-๐Ÿฆœ-"], "T") result = np.strings.replace(a, "๐Ÿฆœ-", "๐Ÿฆœโ€ ", count) assert_array_equal(result, np.array(["๐Ÿ--", "๐Ÿฆœโ€ ๐Ÿฆœโ€ "], "T")) def test_strip_ljust_rjust_consistency(string_array, unicode_array): rjs = np.char.rjust(string_array, 1000) rju = np.char.rjust(unicode_array, 1000) ljs = np.char.ljust(string_array, 1000) lju = np.char.ljust(unicode_array, 1000) assert_array_equal( np.char.lstrip(rjs), np.char.lstrip(rju).astype(StringDType()), ) assert_array_equal( np.char.rstrip(ljs), np.char.rstrip(lju).astype(StringDType()), ) assert_array_equal( np.char.strip(ljs), np.char.strip(lju).astype(StringDType()), ) assert_array_equal( np.char.strip(rjs), np.char.strip(rju).astype(StringDType()), ) def test_unset_na_coercion(): # a dtype instance with an unset na object is compatible # with a dtype that has one set # this test uses the "add" and "equal" ufunc but all ufuncs that # accept more than one string argument and produce a string should # behave this way # TODO: generalize to more ufuncs inp = ["hello", "world"] arr = np.array(inp, dtype=StringDType(na_object=None)) for op_dtype in [None, StringDType(), StringDType(coerce=False), StringDType(na_object=None)]: if op_dtype is None: op = "2" else: op = np.array("2", dtype=op_dtype) res = arr + op assert_array_equal(res, ["hello2", "world2"]) # dtype instances with distinct explicitly set NA objects are incompatible for op_dtype in [StringDType(na_object=pd_NA), StringDType(na_object="")]: op = np.array("2", dtype=op_dtype) with pytest.raises(TypeError): arr + op # comparisons only consider the na_object for op_dtype in [None, StringDType(), StringDType(coerce=True), StringDType(na_object=None)]: if op_dtype is None: op = inp else: op = np.array(inp, dtype=op_dtype) assert_array_equal(arr, op) for op_dtype in [StringDType(na_object=pd_NA), StringDType(na_object=np.nan)]: op = np.array(inp, dtype=op_dtype) with pytest.raises(TypeError): arr == op def test_repeat(string_array): res = string_array.repeat(1000) # Create an empty array with expanded dimension, and fill it. Then, # reshape it to the expected result. expected = np.empty_like(string_array, shape=string_array.shape + (1000,)) expected[...] = string_array[:, np.newaxis] expected = expected.reshape(-1) assert_array_equal(res, expected, strict=True) @pytest.mark.parametrize("tile", [1, 6, (2, 5)]) def test_accumulation(string_array, tile): """Accumulation is odd for StringDType but tests dtypes with references. """ # Fill with mostly empty strings to not create absurdly big strings arr = np.zeros_like(string_array, shape=(100,)) arr[:len(string_array)] = string_array arr[-len(string_array):] = string_array # Bloat size a bit (get above thresholds and test >1 ndim). arr = np.tile(string_array, tile) res = np.add.accumulate(arr, axis=0) res_obj = np.add.accumulate(arr.astype(object), axis=0) assert_array_equal(res, res_obj.astype(arr.dtype), strict=True) if arr.ndim > 1: res = np.add.accumulate(arr, axis=-1) res_obj = np.add.accumulate(arr.astype(object), axis=-1) assert_array_equal(res, res_obj.astype(arr.dtype), strict=True) class TestImplementation: """Check that strings are stored in the arena when possible. This tests implementation details, so should be adjusted if the implementation changes. """ @classmethod def setup_class(self): self.MISSING = 0x80 self.INITIALIZED = 0x40 self.OUTSIDE_ARENA = 0x20 self.LONG = 0x10 self.dtype = StringDType(na_object=np.nan) self.sizeofstr = self.dtype.itemsize sp = self.dtype.itemsize // 2 # pointer size = sizeof(size_t) # Below, size is not strictly correct, since it really uses # 7 (or 3) bytes, but good enough for the tests here. self.view_dtype = np.dtype([ ('offset', f'u{sp}'), ('size', f'u{sp // 2}'), ('xsiz', f'V{sp // 2 - 1}'), ('size_and_flags', 'u1'), ] if sys.byteorder == 'little' else [ ('size_and_flags', 'u1'), ('xsiz', f'V{sp // 2 - 1}'), ('size', f'u{sp // 2}'), ('offset', f'u{sp}'), ]) self.s_empty = "" self.s_short = "01234" self.s_medium = "abcdefghijklmnopqrstuvwxyz" self.s_long = "-=+" * 100 self.a = np.array( [self.s_empty, self.s_short, self.s_medium, self.s_long], self.dtype) def get_view(self, a): # Cannot view a StringDType as anything else directly, since # it has references. So, we use a stride trick hack. from numpy.lib._stride_tricks_impl import DummyArray interface = dict(a.__array_interface__) interface['descr'] = self.view_dtype.descr interface['typestr'] = self.view_dtype.str return np.asarray(DummyArray(interface, base=a)) def get_flags(self, a): return self.get_view(a)['size_and_flags'] & 0xf0 def is_short(self, a): return self.get_flags(a) == self.INITIALIZED | self.OUTSIDE_ARENA def is_on_heap(self, a): return self.get_flags(a) == (self.INITIALIZED | self.OUTSIDE_ARENA | self.LONG) def is_missing(self, a): return self.get_flags(a) & self.MISSING == self.MISSING def in_arena(self, a): return (self.get_flags(a) & (self.INITIALIZED | self.OUTSIDE_ARENA) == self.INITIALIZED) def test_setup(self): is_short = self.is_short(self.a) length = np.strings.str_len(self.a) assert_array_equal(is_short, (length > 0) & (length <= 15)) assert_array_equal(self.in_arena(self.a), [False, False, True, True]) assert_array_equal(self.is_on_heap(self.a), False) assert_array_equal(self.is_missing(self.a), False) view = self.get_view(self.a) sizes = np.where(is_short, view['size_and_flags'] & 0xf, view['size']) assert_array_equal(sizes, np.strings.str_len(self.a)) assert_array_equal(view['xsiz'][2:], np.void(b'\x00' * (self.sizeofstr // 4 - 1))) # Check that the medium string uses only 1 byte for its length # in the arena, while the long string takes 8 (or 4). offsets = view['offset'] assert offsets[2] == 1 assert offsets[3] == 1 + len(self.s_medium) + self.sizeofstr // 2 def test_empty(self): e = np.empty((3,), self.dtype) assert_array_equal(self.get_flags(e), 0) assert_array_equal(e, "") def test_zeros(self): z = np.zeros((2,), self.dtype) assert_array_equal(self.get_flags(z), 0) assert_array_equal(z, "") def test_copy(self): for c in [self.a.copy(), copy.copy(self.a), copy.deepcopy(self.a)]: assert_array_equal(self.get_flags(c), self.get_flags(self.a)) assert_array_equal(c, self.a) offsets = self.get_view(c)['offset'] assert offsets[2] == 1 assert offsets[3] == 1 + len(self.s_medium) + self.sizeofstr // 2 def test_arena_use_with_setting(self): c = np.zeros_like(self.a) assert_array_equal(self.get_flags(c), 0) c[:] = self.a assert_array_equal(self.get_flags(c), self.get_flags(self.a)) assert_array_equal(c, self.a) def test_arena_reuse_with_setting(self): c = self.a.copy() c[:] = self.a assert_array_equal(self.get_flags(c), self.get_flags(self.a)) assert_array_equal(c, self.a) def test_arena_reuse_after_missing(self): c = self.a.copy() c[:] = np.nan assert np.all(self.is_missing(c)) # Replacing with the original strings, the arena should be reused. c[:] = self.a assert_array_equal(self.get_flags(c), self.get_flags(self.a)) assert_array_equal(c, self.a) def test_arena_reuse_after_empty(self): c = self.a.copy() c[:] = "" assert_array_equal(c, "") # Replacing with the original strings, the arena should be reused. c[:] = self.a assert_array_equal(self.get_flags(c), self.get_flags(self.a)) assert_array_equal(c, self.a) def test_arena_reuse_for_shorter(self): c = self.a.copy() # A string slightly shorter than the shortest in the arena # should be used for all strings in the arena. c[:] = self.s_medium[:-1] assert_array_equal(c, self.s_medium[:-1]) # first empty string in original was never initialized, so # filling it in now leaves it initialized inside the arena. # second string started as a short string so it can never live # in the arena. in_arena = np.array([True, False, True, True]) assert_array_equal(self.in_arena(c), in_arena) # But when a short string is replaced, it will go on the heap. assert_array_equal(self.is_short(c), False) assert_array_equal(self.is_on_heap(c), ~in_arena) # We can put the originals back, and they'll still fit, # and short strings are back as short strings c[:] = self.a assert_array_equal(c, self.a) assert_array_equal(self.in_arena(c), in_arena) assert_array_equal(self.is_short(c), self.is_short(self.a)) assert_array_equal(self.is_on_heap(c), False) def test_arena_reuse_if_possible(self): c = self.a.copy() # A slightly longer string will not fit in the arena for # the medium string, but will fit for the longer one. c[:] = self.s_medium + "ยฑ" assert_array_equal(c, self.s_medium + "ยฑ") in_arena_exp = np.strings.str_len(self.a) >= len(self.s_medium) + 1 # first entry started uninitialized and empty, so filling it leaves # it in the arena in_arena_exp[0] = True assert not np.all(in_arena_exp == self.in_arena(self.a)) assert_array_equal(self.in_arena(c), in_arena_exp) assert_array_equal(self.is_short(c), False) assert_array_equal(self.is_on_heap(c), ~in_arena_exp) # And once outside arena, it stays outside, since offset is lost. # But short strings are used again. c[:] = self.a is_short_exp = self.is_short(self.a) assert_array_equal(c, self.a) assert_array_equal(self.in_arena(c), in_arena_exp) assert_array_equal(self.is_short(c), is_short_exp) assert_array_equal(self.is_on_heap(c), ~in_arena_exp & ~is_short_exp) def test_arena_no_reuse_after_short(self): c = self.a.copy() # If we replace a string with a short string, it cannot # go into the arena after because the offset is lost. c[:] = self.s_short assert_array_equal(c, self.s_short) assert_array_equal(self.in_arena(c), False) c[:] = self.a assert_array_equal(c, self.a) assert_array_equal(self.in_arena(c), False) assert_array_equal(self.is_on_heap(c), self.in_arena(self.a))