import numpy as np import scipy.sparse as sp from scipy import linalg from numpy.testing import (assert_array_almost_equal, assert_array_equal, assert_equal) from sklearn.datasets import make_classification from sklearn.utils.sparsefuncs import (mean_variance_axis, incr_mean_variance_axis, inplace_column_scale, inplace_row_scale, inplace_swap_row, inplace_swap_column, min_max_axis, count_nonzero, csc_median_axis_0) from sklearn.utils.sparsefuncs_fast import assign_rows_csr from sklearn.utils.testing import assert_raises def test_mean_variance_axis0(): X, _ = make_classification(5, 4, random_state=0) # Sparsify the array a little bit X[0, 0] = 0 X[2, 1] = 0 X[4, 3] = 0 X_lil = sp.lil_matrix(X) X_lil[1, 0] = 0 X[1, 0] = 0 X_csr = sp.csr_matrix(X_lil) X_means, X_vars = mean_variance_axis(X_csr, axis=0) assert_array_almost_equal(X_means, np.mean(X, axis=0)) assert_array_almost_equal(X_vars, np.var(X, axis=0)) X_csc = sp.csc_matrix(X_lil) X_means, X_vars = mean_variance_axis(X_csc, axis=0) assert_array_almost_equal(X_means, np.mean(X, axis=0)) assert_array_almost_equal(X_vars, np.var(X, axis=0)) assert_raises(TypeError, mean_variance_axis, X_lil, axis=0) X = X.astype(np.float32) X_csr = X_csr.astype(np.float32) X_csc = X_csr.astype(np.float32) X_means, X_vars = mean_variance_axis(X_csr, axis=0) assert_array_almost_equal(X_means, np.mean(X, axis=0)) assert_array_almost_equal(X_vars, np.var(X, axis=0)) X_means, X_vars = mean_variance_axis(X_csc, axis=0) assert_array_almost_equal(X_means, np.mean(X, axis=0)) assert_array_almost_equal(X_vars, np.var(X, axis=0)) assert_raises(TypeError, mean_variance_axis, X_lil, axis=0) def test_mean_variance_axis1(): X, _ = make_classification(5, 4, random_state=0) # Sparsify the array a little bit X[0, 0] = 0 X[2, 1] = 0 X[4, 3] = 0 X_lil = sp.lil_matrix(X) X_lil[1, 0] = 0 X[1, 0] = 0 X_csr = sp.csr_matrix(X_lil) X_means, X_vars = mean_variance_axis(X_csr, axis=1) assert_array_almost_equal(X_means, np.mean(X, axis=1)) assert_array_almost_equal(X_vars, np.var(X, axis=1)) X_csc = sp.csc_matrix(X_lil) X_means, X_vars = mean_variance_axis(X_csc, axis=1) assert_array_almost_equal(X_means, np.mean(X, axis=1)) assert_array_almost_equal(X_vars, np.var(X, axis=1)) assert_raises(TypeError, mean_variance_axis, X_lil, axis=1) X = X.astype(np.float32) X_csr = X_csr.astype(np.float32) X_csc = X_csr.astype(np.float32) X_means, X_vars = mean_variance_axis(X_csr, axis=1) assert_array_almost_equal(X_means, np.mean(X, axis=1)) assert_array_almost_equal(X_vars, np.var(X, axis=1)) X_means, X_vars = mean_variance_axis(X_csc, axis=1) assert_array_almost_equal(X_means, np.mean(X, axis=1)) assert_array_almost_equal(X_vars, np.var(X, axis=1)) assert_raises(TypeError, mean_variance_axis, X_lil, axis=1) def test_incr_mean_variance_axis(): for axis in [0, 1]: rng = np.random.RandomState(0) n_features = 50 n_samples = 10 data_chunks = [rng.random_integers(0, 1, size=n_features) for i in range(n_samples)] # default params for incr_mean_variance last_mean = np.zeros(n_features) last_var = np.zeros_like(last_mean) last_n = 0 # Test errors X = np.array(data_chunks[0]) X = np.atleast_2d(X) X_lil = sp.lil_matrix(X) X_csr = sp.csr_matrix(X_lil) assert_raises(TypeError, incr_mean_variance_axis, axis, last_mean, last_var, last_n) assert_raises(TypeError, incr_mean_variance_axis, axis, last_mean, last_var, last_n) assert_raises(TypeError, incr_mean_variance_axis, X_lil, axis, last_mean, last_var, last_n) # Test _incr_mean_and_var with a 1 row input X_means, X_vars = mean_variance_axis(X_csr, axis) X_means_incr, X_vars_incr, n_incr = \ incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n) assert_array_almost_equal(X_means, X_means_incr) assert_array_almost_equal(X_vars, X_vars_incr) assert_equal(X.shape[axis], n_incr) # X.shape[axis] picks # samples X_csc = sp.csc_matrix(X_lil) X_means, X_vars = mean_variance_axis(X_csc, axis) assert_array_almost_equal(X_means, X_means_incr) assert_array_almost_equal(X_vars, X_vars_incr) assert_equal(X.shape[axis], n_incr) # Test _incremantal_mean_and_var with whole data X = np.vstack(data_chunks) X_lil = sp.lil_matrix(X) X_csr = sp.csr_matrix(X_lil) X_means, X_vars = mean_variance_axis(X_csr, axis) X_means_incr, X_vars_incr, n_incr = \ incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n) assert_array_almost_equal(X_means, X_means_incr) assert_array_almost_equal(X_vars, X_vars_incr) assert_equal(X.shape[axis], n_incr) X_csc = sp.csc_matrix(X_lil) X_means, X_vars = mean_variance_axis(X_csc, axis) assert_array_almost_equal(X_means, X_means_incr) assert_array_almost_equal(X_vars, X_vars_incr) assert_equal(X.shape[axis], n_incr) # All data but as float X = X.astype(np.float32) X_csr = X_csr.astype(np.float32) X_means, X_vars = mean_variance_axis(X_csr, axis) X_means_incr, X_vars_incr, n_incr = \ incr_mean_variance_axis(X_csr, axis, last_mean, last_var, last_n) assert_array_almost_equal(X_means, X_means_incr) assert_array_almost_equal(X_vars, X_vars_incr) assert_equal(X.shape[axis], n_incr) X_csc = X_csr.astype(np.float32) X_means, X_vars = mean_variance_axis(X_csc, axis) assert_array_almost_equal(X_means, X_means_incr) assert_array_almost_equal(X_vars, X_vars_incr) assert_equal(X.shape[axis], n_incr) def test_mean_variance_illegal_axis(): X, _ = make_classification(5, 4, random_state=0) # Sparsify the array a little bit X[0, 0] = 0 X[2, 1] = 0 X[4, 3] = 0 X_csr = sp.csr_matrix(X) assert_raises(ValueError, mean_variance_axis, X_csr, axis=-3) assert_raises(ValueError, mean_variance_axis, X_csr, axis=2) assert_raises(ValueError, mean_variance_axis, X_csr, axis=-1) assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-3, last_mean=None, last_var=None, last_n=None) assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=2, last_mean=None, last_var=None, last_n=None) assert_raises(ValueError, incr_mean_variance_axis, X_csr, axis=-1, last_mean=None, last_var=None, last_n=None) def test_densify_rows(): X = sp.csr_matrix([[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64) X_rows = np.array([0, 2, 3], dtype=np.intp) out = np.ones((6, X.shape[1]), dtype=np.float64) out_rows = np.array([1, 3, 4], dtype=np.intp) expect = np.ones_like(out) expect[out_rows] = X[X_rows, :].toarray() assign_rows_csr(X, X_rows, out_rows, out) assert_array_equal(out, expect) def test_inplace_column_scale(): rng = np.random.RandomState(0) X = sp.rand(100, 200, 0.05) Xr = X.tocsr() Xc = X.tocsc() XA = X.toarray() scale = rng.rand(200) XA *= scale inplace_column_scale(Xc, scale) inplace_column_scale(Xr, scale) assert_array_almost_equal(Xr.toarray(), Xc.toarray()) assert_array_almost_equal(XA, Xc.toarray()) assert_array_almost_equal(XA, Xr.toarray()) assert_raises(TypeError, inplace_column_scale, X.tolil(), scale) X = X.astype(np.float32) scale = scale.astype(np.float32) Xr = X.tocsr() Xc = X.tocsc() XA = X.toarray() XA *= scale inplace_column_scale(Xc, scale) inplace_column_scale(Xr, scale) assert_array_almost_equal(Xr.toarray(), Xc.toarray()) assert_array_almost_equal(XA, Xc.toarray()) assert_array_almost_equal(XA, Xr.toarray()) assert_raises(TypeError, inplace_column_scale, X.tolil(), scale) def test_inplace_row_scale(): rng = np.random.RandomState(0) X = sp.rand(100, 200, 0.05) Xr = X.tocsr() Xc = X.tocsc() XA = X.toarray() scale = rng.rand(100) XA *= scale.reshape(-1, 1) inplace_row_scale(Xc, scale) inplace_row_scale(Xr, scale) assert_array_almost_equal(Xr.toarray(), Xc.toarray()) assert_array_almost_equal(XA, Xc.toarray()) assert_array_almost_equal(XA, Xr.toarray()) assert_raises(TypeError, inplace_column_scale, X.tolil(), scale) X = X.astype(np.float32) scale = scale.astype(np.float32) Xr = X.tocsr() Xc = X.tocsc() XA = X.toarray() XA *= scale.reshape(-1, 1) inplace_row_scale(Xc, scale) inplace_row_scale(Xr, scale) assert_array_almost_equal(Xr.toarray(), Xc.toarray()) assert_array_almost_equal(XA, Xc.toarray()) assert_array_almost_equal(XA, Xr.toarray()) assert_raises(TypeError, inplace_column_scale, X.tolil(), scale) def test_inplace_swap_row(): X = np.array([[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) swap = linalg.get_blas_funcs(('swap',), (X,)) swap = swap[0] X[0], X[-1] = swap(X[0], X[-1]) inplace_swap_row(X_csr, 0, -1) inplace_swap_row(X_csc, 0, -1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) X[2], X[3] = swap(X[2], X[3]) inplace_swap_row(X_csr, 2, 3) inplace_swap_row(X_csc, 2, 3) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) assert_raises(TypeError, inplace_swap_row, X_csr.tolil()) X = np.array([[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float32) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) swap = linalg.get_blas_funcs(('swap',), (X,)) swap = swap[0] X[0], X[-1] = swap(X[0], X[-1]) inplace_swap_row(X_csr, 0, -1) inplace_swap_row(X_csc, 0, -1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) X[2], X[3] = swap(X[2], X[3]) inplace_swap_row(X_csr, 2, 3) inplace_swap_row(X_csc, 2, 3) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) assert_raises(TypeError, inplace_swap_row, X_csr.tolil()) def test_inplace_swap_column(): X = np.array([[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) swap = linalg.get_blas_funcs(('swap',), (X,)) swap = swap[0] X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1]) inplace_swap_column(X_csr, 0, -1) inplace_swap_column(X_csc, 0, -1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1]) inplace_swap_column(X_csr, 0, 1) inplace_swap_column(X_csc, 0, 1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) assert_raises(TypeError, inplace_swap_column, X_csr.tolil()) X = np.array([[0, 3, 0], [2, 4, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float32) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) swap = linalg.get_blas_funcs(('swap',), (X,)) swap = swap[0] X[:, 0], X[:, -1] = swap(X[:, 0], X[:, -1]) inplace_swap_column(X_csr, 0, -1) inplace_swap_column(X_csc, 0, -1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) X[:, 0], X[:, 1] = swap(X[:, 0], X[:, 1]) inplace_swap_column(X_csr, 0, 1) inplace_swap_column(X_csc, 0, 1) assert_array_equal(X_csr.toarray(), X_csc.toarray()) assert_array_equal(X, X_csc.toarray()) assert_array_equal(X, X_csr.toarray()) assert_raises(TypeError, inplace_swap_column, X_csr.tolil()) def test_min_max_axis0(): X = np.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) mins_csr, maxs_csr = min_max_axis(X_csr, axis=0) assert_array_equal(mins_csr, X.min(axis=0)) assert_array_equal(maxs_csr, X.max(axis=0)) mins_csc, maxs_csc = min_max_axis(X_csc, axis=0) assert_array_equal(mins_csc, X.min(axis=0)) assert_array_equal(maxs_csc, X.max(axis=0)) X = X.astype(np.float32) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) mins_csr, maxs_csr = min_max_axis(X_csr, axis=0) assert_array_equal(mins_csr, X.min(axis=0)) assert_array_equal(maxs_csr, X.max(axis=0)) mins_csc, maxs_csc = min_max_axis(X_csc, axis=0) assert_array_equal(mins_csc, X.min(axis=0)) assert_array_equal(maxs_csc, X.max(axis=0)) def test_min_max_axis1(): X = np.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) mins_csr, maxs_csr = min_max_axis(X_csr, axis=1) assert_array_equal(mins_csr, X.min(axis=1)) assert_array_equal(maxs_csr, X.max(axis=1)) mins_csc, maxs_csc = min_max_axis(X_csc, axis=1) assert_array_equal(mins_csc, X.min(axis=1)) assert_array_equal(maxs_csc, X.max(axis=1)) X = X.astype(np.float32) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) mins_csr, maxs_csr = min_max_axis(X_csr, axis=1) assert_array_equal(mins_csr, X.min(axis=1)) assert_array_equal(maxs_csr, X.max(axis=1)) mins_csc, maxs_csc = min_max_axis(X_csc, axis=1) assert_array_equal(mins_csc, X.min(axis=1)) assert_array_equal(maxs_csc, X.max(axis=1)) def test_min_max_axis_errors(): X = np.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) assert_raises(TypeError, min_max_axis, X_csr.tolil(), axis=0) assert_raises(ValueError, min_max_axis, X_csr, axis=2) assert_raises(ValueError, min_max_axis, X_csc, axis=-3) def test_count_nonzero(): X = np.array([[0, 3, 0], [2, -1, 0], [0, 0, 0], [9, 8, 7], [4, 0, 5]], dtype=np.float64) X_csr = sp.csr_matrix(X) X_csc = sp.csc_matrix(X) X_nonzero = X != 0 sample_weight = [.5, .2, .3, .1, .1] X_nonzero_weighted = X_nonzero * np.array(sample_weight)[:, None] for axis in [0, 1, -1, -2, None]: assert_array_almost_equal(count_nonzero(X_csr, axis=axis), X_nonzero.sum(axis=axis)) assert_array_almost_equal(count_nonzero(X_csr, axis=axis, sample_weight=sample_weight), X_nonzero_weighted.sum(axis=axis)) assert_raises(TypeError, count_nonzero, X_csc) assert_raises(ValueError, count_nonzero, X_csr, axis=2) def test_csc_row_median(): # Test csc_row_median actually calculates the median. # Test that it gives the same output when X is dense. rng = np.random.RandomState(0) X = rng.rand(100, 50) dense_median = np.median(X, axis=0) csc = sp.csc_matrix(X) sparse_median = csc_median_axis_0(csc) assert_array_equal(sparse_median, dense_median) # Test that it gives the same output when X is sparse X = rng.rand(51, 100) X[X < 0.7] = 0.0 ind = rng.randint(0, 50, 10) X[ind] = -X[ind] csc = sp.csc_matrix(X) dense_median = np.median(X, axis=0) sparse_median = csc_median_axis_0(csc) assert_array_equal(sparse_median, dense_median) # Test for toy data. X = [[0, -2], [-1, -1], [1, 0], [2, 1]] csc = sp.csc_matrix(X) assert_array_equal(csc_median_axis_0(csc), np.array([0.5, -0.5])) X = [[0, -2], [-1, -5], [1, -3]] csc = sp.csc_matrix(X) assert_array_equal(csc_median_axis_0(csc), np.array([0., -3])) # Test that it raises an Error for non-csc matrices. assert_raises(TypeError, csc_median_axis_0, sp.csr_matrix(X))