Here are examples of the Python API numpy.nan taken from open source projects.
171 Examples
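Before the project examples, here is a minimal refresher on how np.nan itself behaves (a small self-contained sketch, not taken from any of the projects below):

import numpy as np

a = np.array([1.0, np.nan, 3.0])

# NaN never compares equal to anything, including itself.
assert np.nan != np.nan
# Use np.isnan to detect missing values elementwise.
assert np.isnan(a).tolist() == [False, True, False]
# Plain aggregations propagate NaN; the nan-aware variants skip it.
assert np.isnan(a.sum())
assert np.nansum(a) == 4.0
assert np.nanmean(a) == 2.0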
Example 51
Project: AlephNull Source File: test_events_through_risk.py
def test_daily_buy_and_hold(self):
start_date = datetime.datetime(
year=2006,
month=1,
day=3,
hour=0,
minute=0,
tzinfo=pytz.utc)
end_date = datetime.datetime(
year=2006,
month=1,
day=5,
hour=0,
minute=0,
tzinfo=pytz.utc)
sim_params = SimulationParameters(
period_start=start_date,
period_end=end_date,
emission_rate='daily'
)
algo = BuyAndHoldAlgorithm(
sim_params=sim_params,
data_frequency='daily')
first_date = datetime.datetime(2006, 1, 3, tzinfo=pytz.utc)
second_date = datetime.datetime(2006, 1, 4, tzinfo=pytz.utc)
third_date = datetime.datetime(2006, 1, 5, tzinfo=pytz.utc)
trade_bar_data = [
Event({
'open_price': 10,
'close_price': 15,
'price': 15,
'volume': 1000,
'sid': 1,
'dt': first_date,
'source_id': 'test-trade-source',
'type': DATASOURCE_TYPE.TRADE
}),
Event({
'open_price': 15,
'close_price': 20,
'price': 20,
'volume': 2000,
'sid': 1,
'dt': second_date,
'source_id': 'test_list',
'type': DATASOURCE_TYPE.TRADE
}),
Event({
'open_price': 20,
'close_price': 15,
'price': 15,
'volume': 1000,
'sid': 1,
'dt': third_date,
'source_id': 'test_list',
'type': DATASOURCE_TYPE.TRADE
}),
]
benchmark_data = [
Event({
'returns': 0.1,
'dt': first_date,
'source_id': 'test-benchmark-source',
'type': DATASOURCE_TYPE.BENCHMARK
}),
Event({
'returns': 0.2,
'dt': second_date,
'source_id': 'test-benchmark-source',
'type': DATASOURCE_TYPE.BENCHMARK
}),
Event({
'returns': 0.4,
'dt': third_date,
'source_id': 'test-benchmark-source',
'type': DATASOURCE_TYPE.BENCHMARK
}),
]
algo.benchmark_return_source = benchmark_data
algo.sources = list([trade_bar_data])
gen = algo._create_generator(sim_params)
# TODO: Hand derive these results.
# Currently, these are the outputs captured at the time of this writing,
# to at least serve as an early warning against changes.
expected_algorithm_returns = {
first_date: 0.0,
second_date: -0.000350,
third_date: -0.050018
}
# TODO: Hand derive these results.
# Currently, these are the outputs captured at the time of this writing,
# to at least serve as an early warning against changes.
expected_sharpe = {
first_date: np.nan,
second_date: -31.56903265,
third_date: -11.459888981,
}
for bar in gen:
current_dt = algo.datetime
crm = algo.perf_tracker.cumulative_risk_metrics
np.testing.assert_almost_equal(
crm.algorithm_returns[current_dt],
expected_algorithm_returns[current_dt],
decimal=6)
np.testing.assert_almost_equal(
crm.metrics.sharpe[current_dt],
expected_sharpe[current_dt],
decimal=6,
err_msg="Mismatch at %s" % (current_dt,))
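A side note on the np.nan entry in expected_sharpe above: numpy's testing helpers treat NaN as equal to NaN, so an undefined first-day Sharpe ratio can be asserted the same way as the numeric values. A minimal illustration (not part of the project code):

import numpy as np

np.testing.assert_almost_equal(np.nan, np.nan)                 # passes
np.testing.assert_almost_equal([1.0, np.nan], [1.0, np.nan])   # passes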
Example 52
Project: finmarketpy Source File: eventstudy.py
def get_intraday_moves_over_custom_event(self, data_frame_rets, ef_time_frame, vol=False,
minute_start = 5, mins = 3 * 60, min_offset = 0 , create_index = False,
resample = False, freq = 'minutes'):
filter = Filter()
ef_time_frame = filter.filter_time_series_by_date(data_frame_rets.index[0], data_frame_rets.index[-1], ef_time_frame)
ef_time = ef_time_frame.index
if freq == 'minutes':
ef_time_start = ef_time - timedelta(minutes = minute_start)
ef_time_end = ef_time + timedelta(minutes = mins)
ann_factor = 252 * 1440
elif freq == 'days':
ef_time = ef_time_frame.index.normalize()
ef_time_start = ef_time - timedelta(days = minute_start)
ef_time_end = ef_time + timedelta(days = mins)
ann_factor = 252
ords = range(-minute_start + min_offset, mins + min_offset)
# all data needs to be equally spaced
if resample:
# make sure time series is properly sampled at 1 min intervals
data_frame_rets = data_frame_rets.resample('1min')
data_frame_rets = data_frame_rets.fillna(value = 0)
data_frame_rets = filter.remove_out_FX_out_of_hours(data_frame_rets)
data_frame_rets['Ind'] = numpy.nan
start_index = data_frame_rets.index.searchsorted(ef_time_start)
finish_index = data_frame_rets.index.searchsorted(ef_time_end)
# not all observation windows will be same length (eg. last one?)
# fill the indices which represent minutes
# TODO vectorise this!
for i in range(0, len(ef_time_frame.index)):
try:
data_frame_rets.ix[start_index[i]:finish_index[i], 'Ind'] = ords
except:
data_frame_rets.ix[start_index[i]:finish_index[i], 'Ind'] = ords[0:(finish_index[i] - start_index[i])]
# set the release dates
data_frame_rets.ix[start_index,'Rel'] = ef_time # set entry points
data_frame_rets.ix[finish_index + 1,'Rel'] = numpy.zeros(len(start_index)) # set exit points
data_frame_rets['Rel'] = data_frame_rets['Rel'].fillna(method = 'pad') # fill down signals
data_frame_rets = data_frame_rets[pandas.notnull(data_frame_rets['Ind'])] # get rid of other
data_frame = data_frame_rets.pivot(index='Ind',
columns='Rel', values=data_frame_rets.columns[0])
data_frame.index.names = [None]
if create_index:
calculations = Calculations()
data_frame.ix[-minute_start + min_offset,:] = numpy.nan
data_frame = calculations.create_mult_index(data_frame)
else:
if vol is True:
# annualise (if vol)
data_frame = data_frame.rolling(center=False,window=5).std() * math.sqrt(ann_factor)
else:
data_frame = data_frame.cumsum()
return data_frame
Example 53
@skipif(WIN32)
def test_replace_missing(self):
x = self.x.copy()
x[::5, ::7] = np.nan
pc = PCA(x, missing='drop-row')
x_dropped_row = x[np.logical_not(np.any(np.isnan(x), 1))]
pc_dropped = PCA(x_dropped_row)
assert_equal(pc.projection, pc_dropped.projection)
assert_equal(x, pc.data)
pc = PCA(x, missing='drop-col')
x_dropped_col = x[:, np.logical_not(np.any(np.isnan(x), 0))]
pc_dropped = PCA(x_dropped_col)
assert_equal(pc.projection, pc_dropped.projection)
assert_equal(x, pc.data)
pc = PCA(x, missing='drop-min')
if x_dropped_row.size > x_dropped_col.size:
x_dropped_min = x_dropped_row
else:
x_dropped_min = x_dropped_col
pc_dropped = PCA(x_dropped_min)
assert_equal(pc.projection, pc_dropped.projection)
assert_equal(x, pc.data)
pc = PCA(x, ncomp=3, missing='fill-em')
missing = np.isnan(x)
mu = nanmean(x, axis=0)
errors = x - mu
sigma = np.sqrt(nanmean(errors ** 2, axis=0))
x_std = errors / sigma
x_std[missing] = 0.0
last = x_std[missing]
delta = 1.0
count = 0
while delta > 5e-8:
pc_temp = PCA(x_std, ncomp=3, standardize=False, demean=False)
x_std[missing] = pc_temp.projection[missing]
current = x_std[missing]
diff = current - last
delta = np.sqrt(np.sum(diff ** 2)) / np.sqrt(np.sum(current ** 2))
last = current
count += 1
x = self.x + 0.0
projection = pc_temp.projection * sigma + mu
x[missing] = projection[missing]
assert_allclose(pc._adjusted_data, x)
# Check data for no changes
assert_equal(self.x, self.x_copy)
x = self.x
pc = PCA(x)
pc_dropped = PCA(x, missing='drop-row')
assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)
pc_dropped = PCA(x, missing='drop-col')
assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)
pc_dropped = PCA(x, missing='drop-min')
assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)
pc = PCA(x, ncomp=3)
pc_dropped = PCA(x, ncomp=3, missing='fill-em')
assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)
# Test too many missing for missing='fill-em'
x = self.x.copy()
x[:, :] = np.nan
assert_raises(ValueError, PCA, x, missing='drop-row')
assert_raises(ValueError, PCA, x, missing='drop-col')
assert_raises(ValueError, PCA, x, missing='drop-min')
assert_raises(ValueError, PCA, x, missing='fill-em')
Example 54
Project: tia Source File: plots.py
def plot_return_on_dollar(rets, title='Return on $1', show_maxdd=0, figsize=None, ax=None, append=0, label=None, **plot_args):
""" Show the cuemulative return of specified rets and max drawdowns if selected."""
crets = (1. + returns_cuemulative(rets, expanding=1))
if isinstance(crets, pd.DataFrame):
tmp = crets.copy()
for c in tmp.columns:
s = tmp[c]
fv = s.first_valid_index()
fi = s.index.get_loc(fv)
if fi != 0:
tmp.ix[fi - 1, c] = 1.
else:
if not s.index.freq:
# no frequency set
freq = guess_freq(s.index)
s = s.asfreq(freq)
first = s.index.shift(-1)[0]
tmp = pd.concat([pd.DataFrame({c: [1.]}, index=[first]), tmp])
crets = tmp
if append:
toadd = crets.index.shift(1)[-1]
crets = pd.concat([crets, pd.DataFrame(np.nan, columns=crets.columns, index=[toadd])])
else:
fv = crets.first_valid_index()
fi = crets.index.get_loc(fv)
if fi != 0:
crets = crets.copy()
crets.iloc[fi - 1] = 1.
else:
if not crets.index.freq:
first = crets.asfreq(guess_freq(crets.index)).index.shift(-1)[0]
else:
first = crets.index.shift(-1)[0]
tmp = pd.Series([1.], index=[first])
tmp = tmp.append(crets)
crets = tmp
if append:
toadd = pd.Series(np.nan, index=[crets.index.shift(1)[-1]])
crets = crets.append(toadd)
ax = crets.plot(figsize=figsize, title=title, ax=ax, label=label, **plot_args)
AxesFormat().Y.apply_format(new_float_formatter()).X.label("").apply(ax)
#ax.tick_params(labelsize=14)
if show_maxdd:
# find the max drawdown available by using original rets
if isinstance(rets, pd.DataFrame):
iterator = rets.iteritems()
else:
iterator = iter([('', rets)])
for c, col in iterator:
dd, dt = max_drawdown(col, inc_date=1)
lbl = c and c + ' maxdd' or 'maxdd'
# get cret to place annotation correctly
if isinstance(crets, pd.DataFrame):
amt = crets.ix[dt, c]
else:
amt = crets[dt]
bbox_props = dict(boxstyle="round", fc="w", ec="0.5", alpha=0.7)
# sub = lambda c: c and len(c) > 2 and c[:2] or c
try:
dtstr = '{0}'.format(dt.to_period())
except:
dtstr = '{0}'.format(dt)
ax.text(dt, amt, "mdd {0}".format(dtstr).strip(), ha="center",
va="center", size=10, bbox=bbox_props)
plt.tight_layout()
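The "return on $1" series that the helper above plots boils down to compounding periodic returns into a wealth index. A minimal sketch in plain pandas (illustrative only; it omits the frequency guessing, prepended 1.0 anchor point and drawdown annotations handled by the tia code):

import pandas as pd

rets = pd.Series([0.01, -0.02, 0.03],
                 index=pd.date_range('2020-01-01', periods=3))
dollar_growth = (1.0 + rets).cumprod()
print(dollar_growth.iloc[-1])  # ~1.0195 (= 1.01 * 0.98 * 1.03)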
Example 55
def synthetic_data(desired_auc, score_range, num_records, rng, frac_true):
"""Create synthetic boolean_labels and scores with adjustable auc.
Args:
desired_auc: Number in [0, 1], the theoretical AUC of resultant data.
score_range: 2-tuple, (low, high), giving the range of the resultant scores
num_records: Positive integer. The number of records to return.
rng: Initialized np.random.RandomState random number generator
frac_true: Number in (0, 1). Expected fraction of resultant labels that
will be True. This is just in expectation...more or less may actually be
True.
Returns:
boolean_labels: np.array, dtype=bool.
scores: np.array, dtype=np.float32
"""
# We prove here why the method (below) for computing AUC works. Of course we
# also checked this against sklearn.metrics.roc_auc_score.
#
# First do this for score_range = [0, 1], then rescale.
# WLOG assume AUC >= 0.5, otherwise we will solve for AUC >= 0.5 then swap
# the labels.
# So for AUC in [0, 1] we create False and True labels
# and corresponding scores drawn from:
# F ~ U[0, 1], T ~ U[x, 1]
# We have,
# AUC
# = P[T > F]
# = P[T > F | F < x] P[F < x] + P[T > F | F > x] P[F > x]
# = (1 * x) + (0.5 * (1 - x)).
# Inverting, we have:
# x = 2 * AUC - 1, when AUC >= 0.5.
assert 0 <= desired_auc <= 1
assert 0 < frac_true < 1
if desired_auc < 0.5:
flip_labels = True
desired_auc = 1 - desired_auc
frac_true = 1 - frac_true
else:
flip_labels = False
x = 2 * desired_auc - 1
labels = rng.binomial(1, frac_true, size=num_records).astype(bool)
num_true = labels.sum()
num_false = num_records - labels.sum()
# Draw F ~ U[0, 1], and T ~ U[x, 1]
false_scores = rng.rand(num_false)
true_scores = x + rng.rand(num_true) * (1 - x)
# Reshape [0, 1] to score_range.
def reshape(scores):
return score_range[0] + scores * (score_range[1] - score_range[0])
false_scores = reshape(false_scores)
true_scores = reshape(true_scores)
# Place into one array corresponding with the labels.
scores = np.nan * np.ones(num_records, dtype=np.float32)
scores[labels] = true_scores
scores[~labels] = false_scores
if flip_labels:
labels = ~labels
return labels, scores
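A quick empirical check of the construction above (a sketch; it assumes scikit-learn is installed and synthetic_data is defined as shown):

import numpy as np
from sklearn.metrics import roc_auc_score

rng = np.random.RandomState(0)
labels, scores = synthetic_data(desired_auc=0.8, score_range=(0.0, 1.0),
                                num_records=100000, rng=rng, frac_true=0.5)
print(roc_auc_score(labels, scores))  # should land close to 0.8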
Example 56
def plotGrid(self, ax=None, nodes=False, faces=False, centers=False, edges=False, lines=True, showIt=False):
"""Plot the nodal, cell-centered and staggered grids for 1,2 and 3 dimensions.
.. plot::
:include-source:
from SimPEG import Mesh, Utils
X, Y = Utils.exampleLrmGrid([3,3],'rotate')
M = Mesh.CurvilinearMesh([X, Y])
M.plotGrid(showIt=True)
"""
import matplotlib.pyplot as plt
import matplotlib
from mpl_toolkits.mplot3d import Axes3D
axOpts = {'projection':'3d'} if self.dim == 3 else {}
if ax is None: ax = plt.subplot(111, **axOpts)
NN = self.r(self.gridN, 'N', 'N', 'M')
if self.dim == 2:
if lines:
X1 = np.c_[mkvc(NN[0][:-1, :]), mkvc(NN[0][1:, :]), mkvc(NN[0][:-1, :])*np.nan].flatten()
Y1 = np.c_[mkvc(NN[1][:-1, :]), mkvc(NN[1][1:, :]), mkvc(NN[1][:-1, :])*np.nan].flatten()
X2 = np.c_[mkvc(NN[0][:, :-1]), mkvc(NN[0][:, 1:]), mkvc(NN[0][:, :-1])*np.nan].flatten()
Y2 = np.c_[mkvc(NN[1][:, :-1]), mkvc(NN[1][:, 1:]), mkvc(NN[1][:, :-1])*np.nan].flatten()
X = np.r_[X1, X2]
Y = np.r_[Y1, Y2]
ax.plot(X, Y, 'b-')
if centers:
ax.plot(self.gridCC[:,0],self.gridCC[:,1],'ro')
# Nx = self.r(self.normals, 'F', 'Fx', 'V')
# Ny = self.r(self.normals, 'F', 'Fy', 'V')
# Tx = self.r(self.tangents, 'E', 'Ex', 'V')
# Ty = self.r(self.tangents, 'E', 'Ey', 'V')
# ax.plot(self.gridN[:, 0], self.gridN[:, 1], 'bo')
# nX = np.c_[self.gridFx[:, 0], self.gridFx[:, 0] + Nx[0]*length, self.gridFx[:, 0]*np.nan].flatten()
# nY = np.c_[self.gridFx[:, 1], self.gridFx[:, 1] + Nx[1]*length, self.gridFx[:, 1]*np.nan].flatten()
# ax.plot(self.gridFx[:, 0], self.gridFx[:, 1], 'rs')
# ax.plot(nX, nY, 'r-')
# nX = np.c_[self.gridFy[:, 0], self.gridFy[:, 0] + Ny[0]*length, self.gridFy[:, 0]*np.nan].flatten()
# nY = np.c_[self.gridFy[:, 1], self.gridFy[:, 1] + Ny[1]*length, self.gridFy[:, 1]*np.nan].flatten()
# #ax.plot(self.gridFy[:, 0], self.gridFy[:, 1], 'gs')
# ax.plot(nX, nY, 'g-')
# tX = np.c_[self.gridEx[:, 0], self.gridEx[:, 0] + Tx[0]*length, self.gridEx[:, 0]*np.nan].flatten()
# tY = np.c_[self.gridEx[:, 1], self.gridEx[:, 1] + Tx[1]*length, self.gridEx[:, 1]*np.nan].flatten()
# ax.plot(self.gridEx[:, 0], self.gridEx[:, 1], 'r^')
# ax.plot(tX, tY, 'r-')
# nX = np.c_[self.gridEy[:, 0], self.gridEy[:, 0] + Ty[0]*length, self.gridEy[:, 0]*np.nan].flatten()
# nY = np.c_[self.gridEy[:, 1], self.gridEy[:, 1] + Ty[1]*length, self.gridEy[:, 1]*np.nan].flatten()
# #ax.plot(self.gridEy[:, 0], self.gridEy[:, 1], 'g^')
# ax.plot(nX, nY, 'g-')
elif self.dim == 3:
X1 = np.c_[mkvc(NN[0][:-1, :, :]), mkvc(NN[0][1:, :, :]), mkvc(NN[0][:-1, :, :])*np.nan].flatten()
Y1 = np.c_[mkvc(NN[1][:-1, :, :]), mkvc(NN[1][1:, :, :]), mkvc(NN[1][:-1, :, :])*np.nan].flatten()
Z1 = np.c_[mkvc(NN[2][:-1, :, :]), mkvc(NN[2][1:, :, :]), mkvc(NN[2][:-1, :, :])*np.nan].flatten()
X2 = np.c_[mkvc(NN[0][:, :-1, :]), mkvc(NN[0][:, 1:, :]), mkvc(NN[0][:, :-1, :])*np.nan].flatten()
Y2 = np.c_[mkvc(NN[1][:, :-1, :]), mkvc(NN[1][:, 1:, :]), mkvc(NN[1][:, :-1, :])*np.nan].flatten()
Z2 = np.c_[mkvc(NN[2][:, :-1, :]), mkvc(NN[2][:, 1:, :]), mkvc(NN[2][:, :-1, :])*np.nan].flatten()
X3 = np.c_[mkvc(NN[0][:, :, :-1]), mkvc(NN[0][:, :, 1:]), mkvc(NN[0][:, :, :-1])*np.nan].flatten()
Y3 = np.c_[mkvc(NN[1][:, :, :-1]), mkvc(NN[1][:, :, 1:]), mkvc(NN[1][:, :, :-1])*np.nan].flatten()
Z3 = np.c_[mkvc(NN[2][:, :, :-1]), mkvc(NN[2][:, :, 1:]), mkvc(NN[2][:, :, :-1])*np.nan].flatten()
X = np.r_[X1, X2, X3]
Y = np.r_[Y1, Y2, Y3]
Z = np.r_[Z1, Z2, Z3]
ax.plot(X, Y, 'b', zs=Z)
ax.set_zlabel('x3')
ax.grid(True)
ax.set_xlabel('x1')
ax.set_ylabel('x2')
if showIt:
plt.show()
Example 57
def charpath(D, include_diagonal=False, include_infinite=True):
'''
The characteristic path length is the average shortest path length in
the network. The global efficiency is the average inverse shortest path
length in the network.
Parameters
----------
D : NxN np.ndarray
distance matrix
include_diagonal : bool
If True, include the weights on the diagonal. Default value is False.
include_infinite : bool
If True, include infinite distances in calculation
Returns
-------
lambda : float
characteristic path length
efficiency : float
global efficiency
ecc : Nx1 np.ndarray
eccentricity at each vertex
radius : float
radius of graph
diameter : float
diameter of graph
Notes
-----
The input distance matrix may be obtained with any of the distance
functions, e.g. distance_bin, distance_wei.
Characteristic path length is calculated as the global mean of
the distance matrix D, excluding any 'Infs' but including distances on
the main diagonal.
'''
D = D.copy()
if not include_diagonal:
np.fill_diagonal(D, np.nan)
if not include_infinite:
D[np.isinf(D)] = np.nan
Dv = D[np.logical_not(np.isnan(D))].ravel()
# mean of finite entries of D[G]
lambda_ = np.mean(Dv)
# efficiency: mean of inverse entries of D[G]
efficiency = np.mean(1 / Dv)
# eccentricity for each vertex (ignore inf)
ecc = np.array(np.ma.masked_where(np.isnan(D), D).max(axis=1))  # NaNs must be masked explicitly (NaN != NaN)
# radius of graph
radius = np.min(ecc) # but what about zeros?
# diameter of graph
diameter = np.max(ecc)
return lambda_, efficiency, ecc, radius, diameter
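A tiny hand-checkable run of the function above (a sketch; uses charpath as defined here on the distance matrix of a 3-node path graph 0-1-2):

import numpy as np

D = np.array([[0., 1., 2.],
              [1., 0., 1.],
              [2., 1., 0.]])
lambda_, efficiency, ecc, radius, diameter = charpath(D)
# Off-diagonal distances are [1, 2, 1, 1, 2, 1], so:
print(round(lambda_, 3))     # 1.333 (mean distance)
print(round(efficiency, 3))  # 0.833 (mean inverse distance)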
Example 58
Project: pysystemtrade Source File: test_Accounting.py
def test_get_trades_from_positions(self):
positions = pd.DataFrame(
[np.nan, 2, 3, np.nan, 2, 3, 3.1, 4, 3, 5, 7],
dt_range2)
price = pd.DataFrame(
[100, 103, np.nan, 106, 110, 105, np.nan, 106, 120, np.nan, 142],
dt_range2)
# test delayed fill
delayfill = True
roundpositions = True
get_daily_returns_volatility = None
forecast = None
fx = None
value_of_price_point = None
trades = get_trades_from_positions(price,
positions,
delayfill,
roundpositions,
get_daily_returns_volatility,
forecast,
fx,
value_of_price_point)
np.testing.assert_almost_equal(
trades.trades,
[2.0, 1.0, -1.0, 1.0, 1.0, -1.0, 2.0, 2.0])
np.testing.assert_almost_equal(
trades.fill_price[:-1],
[106.0, 106.0, 105.0, 106.0, 120.0, 142.0, 142.0])
# test none delayed fill
delayfill = False
trades = get_trades_from_positions(price,
positions,
delayfill,
roundpositions,
get_daily_returns_volatility,
forecast,
fx,
value_of_price_point)
np.testing.assert_almost_equal(
trades.trades,
[2.0, 1.0, -1.0, 1.0, 1.0, -1.0, 2.0, 2.0])
np.testing.assert_almost_equal(
trades.fill_price,
[103.0, 106.0, 110.0, 105.0, 106.0, 120.0, 142.0, 142.0])
# test roundpositions
delayfill = True
roundpositions = False
trades = get_trades_from_positions(price,
positions,
delayfill,
roundpositions,
get_daily_returns_volatility,
forecast,
fx,
value_of_price_point)
np.testing.assert_almost_equal(
trades.trades,
[2.0, 1.0, -1.0, 1.0, 0.1, 0.9, -1.0, 2.0, 2.0])
np.testing.assert_almost_equal(
trades.fill_price[:-1],
[106.0, 106.0, 105.0, 106.0, 106.0, 120.0, 142.0, 142.0])
Example 59
def test_pairwise(self):
df = pd.DataFrame(index=list(range(10)))
for x in range(3):
df[x] = list(range(x, x+10))
nandf = df.copy().astype(float)
nandf.ix[9:,1] = np.nan
# test with order=True
# test with permutations
pairs = pairwise(df, lambda x, y: x.sum() - y.sum())
expected = pd.DataFrame([[0, -10, -20],
[10, 0, -10],
[20, 10, 0]], index=list(range(3)), dtype=float)
tm.assert_frame_equal(pairs, expected)
# test with combinations
pairs = pairwise(df, lambda x, y: x.sum() - y.sum(), order=False)
expected = pd.DataFrame([[0, -10, -20],
[-10, 0, -10],
[-20, -10, 0]], index=list(range(3)), dtype=float)
tm.assert_frame_equal(pairs, expected)
# test with combinations and values
# use nandf to test. np.ndarray.sum() returns NaN if it contains nan
pairs = pairwise(nandf, lambda x, y: x.sum() - y.sum(), order=True,
force_values=True)
expected = pd.DataFrame([[0, np.nan, -20],
[np.nan, np.nan, np.nan],
[20, np.nan, 0]], index=list(range(3)), dtype=float)
tm.assert_frame_equal(pairs, expected)
# test with np.nansum.
pairs = pairwise(nandf, lambda x, y: np.nansum(x) - np.nansum(y),
order=True, force_values=True)
expected = pd.DataFrame([[0, 0, -20],
[0, 0, -20],
[20, 20, 0]], index=list(range(3)), dtype=float)
tm.assert_frame_equal(pairs, expected)
# the np.nansum version should be same as Series.sum version
pairs_series = pairwise(nandf, lambda x, y: x.sum() - y.sum(),
order=True, force_values=False)
tm.assert_frame_equal(pairs, pairs_series)
Example 60
def estimate(self, src, dst):
"""Estimate the transformation from a set of corresponding points.
You can determine the over-, well- and under-determined parameters
with the total least-squares method.
Number of source and destination coordinates must match.
The transformation is defined as::
X = (a0*x + a1*y + a2) / (c0*x + c1*y + 1)
Y = (b0*x + b1*y + b2) / (c0*x + c1*y + 1)
These equations can be transformed to the following form::
0 = a0*x + a1*y + a2 - c0*x*X - c1*y*X - X
0 = b0*x + b1*y + b2 - c0*x*Y - c1*y*Y - Y
which exist for each set of corresponding points, so we have a set of
N * 2 equations. The coefficients appear linearly so we can write
A x = 0, where::
A = [[x y 1 0 0 0 -x*X -y*X -X]
[0 0 0 x y 1 -x*Y -y*Y -Y]
...
...
]
x.T = [a0 a1 a2 b0 b1 b2 c0 c1 c3]
In case of total least-squares the solution of this homogeneous system
of equations is the right singular vector of A which corresponds to the
smallest singular value normed by the coefficient c3.
In case of the affine transformation the coefficients c0 and c1 are 0.
Thus the system of equations is::
A = [[x y 1 0 0 0 -X]
[0 0 0 x y 1 -Y]
...
...
]
x.T = [a0 a1 a2 b0 b1 b2 c3]
Parameters
----------
src : (N, 2) array
Source coordinates.
dst : (N, 2) array
Destination coordinates.
Returns
-------
success : bool
True, if model estimation succeeds.
"""
try:
src_matrix, src = _center_and_normalize_points(src)
dst_matrix, dst = _center_and_normalize_points(dst)
except ZeroDivisionError:
self.params = np.nan * np.empty((3, 3))
return False
xs = src[:, 0]
ys = src[:, 1]
xd = dst[:, 0]
yd = dst[:, 1]
rows = src.shape[0]
# params: a0, a1, a2, b0, b1, b2, c0, c1
A = np.zeros((rows * 2, 9))
A[:rows, 0] = xs
A[:rows, 1] = ys
A[:rows, 2] = 1
A[:rows, 6] = - xd * xs
A[:rows, 7] = - xd * ys
A[rows:, 3] = xs
A[rows:, 4] = ys
A[rows:, 5] = 1
A[rows:, 6] = - yd * xs
A[rows:, 7] = - yd * ys
A[:rows, 8] = xd
A[rows:, 8] = yd
# Select relevant columns, depending on params
A = A[:, list(self._coeffs) + [8]]
_, _, V = np.linalg.svd(A)
H = np.zeros((3, 3))
# solution is right singular vector that corresponds to smallest
# singular value
H.flat[list(self._coeffs) + [8]] = - V[-1, :-1] / V[-1, -1]
H[2, 2] = 1
# De-center and de-normalize
H = np.dot(np.linalg.inv(dst_matrix), np.dot(H, src_matrix))
self.params = H
return True
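The homogeneous system described in the docstring can be solved directly for an exact toy case. A minimal sketch (it skips the _center_and_normalize_points conditioning step the real estimator uses, so it is only suitable for clean synthetic data; the sign convention for the last column also differs slightly, keeping -X and -Y inside A so the null vector is the homography directly):

import numpy as np

src = np.array([[0., 0.], [1., 0.], [1., 1.], [0., 1.]])
# Known transform: scale by 2, translate by (3, 5).
dst = src * 2.0 + np.array([3.0, 5.0])

rows = src.shape[0]
A = np.zeros((2 * rows, 9))
A[:rows, 0:2] = src
A[:rows, 2] = 1
A[:rows, 6] = -dst[:, 0] * src[:, 0]
A[:rows, 7] = -dst[:, 0] * src[:, 1]
A[:rows, 8] = -dst[:, 0]
A[rows:, 3:5] = src
A[rows:, 5] = 1
A[rows:, 6] = -dst[:, 1] * src[:, 0]
A[rows:, 7] = -dst[:, 1] * src[:, 1]
A[rows:, 8] = -dst[:, 1]

# The null vector of A (right singular vector for the smallest singular
# value) is the flattened homography, up to scale.
_, _, V = np.linalg.svd(A)
H = V[-1].reshape(3, 3)
H = H / H[2, 2]
print(np.round(H, 6))  # ~[[2, 0, 3], [0, 2, 5], [0, 0, 1]]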
Example 61
Project: bokeh Source File: test_comp_glyphs.py
def test_xyglyph_xy_range():
def check_bounds(xyg, xmin=0, xmax=4, ymin=1, ymax=5):
assert xyg.x_min == xmin
assert xyg.x_max == xmax
assert xyg.y_min == ymin
assert xyg.y_max == ymax
for Glyph in [LineGlyph, PointGlyph]:
x = pd.Series([0, 1, 2, 3, 4])
y = pd.Series([5, 4, 3, 2, 1])
xyg = Glyph(x=x, y=y)
check_bounds(xyg)
x[1] = x[2] = np.nan
xyg = Glyph(x=x, y=y)
check_bounds(xyg)
x[0] = np.nan
xyg = Glyph(x=x, y=y)
check_bounds(xyg, xmin=3)
x[4] = np.nan
xyg = Glyph(x=x, y=y)
check_bounds(xyg, xmin=3, xmax=3)
y[1] = y[2] = np.nan
xyg = Glyph(x=x, y=y)
check_bounds(xyg, xmin=3, xmax=3)
y[0] = np.nan
xyg = Glyph(x=x, y=y)
check_bounds(xyg, xmin=3, xmax=3, ymax=2)
y[4] = np.nan
xyg = Glyph(x=x, y=y)
check_bounds(xyg, xmin=3, xmax=3, ymax=2, ymin=2)
Example 62
Project: scikit-bio Source File: distance.py
@experimental(as_of='0.4.2')
def hamming(seq1, seq2):
"""Compute Hamming distance between two sequences.
The Hamming distance between two equal-length sequences is the proportion
of differing characters.
Parameters
----------
seq1, seq2 : Sequence
Sequences to compute Hamming distance between.
Returns
-------
float
Hamming distance between `seq1` and `seq2`.
Raises
------
TypeError
If `seq1` and `seq2` are not ``Sequence`` instances.
TypeError
If `seq1` and `seq2` are not the same type.
ValueError
If `seq1` and `seq2` are not the same length.
See Also
--------
scipy.spatial.distance.hamming
Notes
-----
``np.nan`` will be returned if the sequences do not contain any characters.
This function does not make assumptions about the sequence alphabet in use.
Each sequence object's underlying sequence of characters are used to
compute Hamming distance. Characters that may be considered equivalent in
certain contexts (e.g., `-` and `.` as gap characters) are treated as
distinct characters when computing Hamming distance.
Examples
--------
>>> from skbio import Sequence
>>> from skbio.sequence.distance import hamming
>>> seq1 = Sequence('AGGGTA')
>>> seq2 = Sequence('CGTTTA')
>>> hamming(seq1, seq2)
0.5
"""
_check_seqs(seq1, seq2)
# Hamming requires equal length sequences. We are checking this here
# because the error you would get otherwise is cryptic.
if len(seq1) != len(seq2):
raise ValueError(
"Hamming distance can only be computed between sequences of equal "
"length (%d != %d)" % (len(seq1), len(seq2)))
# scipy throws a RuntimeWarning when computing Hamming distance on length 0
# input.
if not seq1:
distance = np.nan
else:
distance = scipy.spatial.distance.hamming(seq1.values, seq2.values)
return float(distance)
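For plain character sequences the same proportion can be computed directly with NumPy (a sketch, independent of scikit-bio; it assumes two equal-length strings):

import numpy as np

s1 = np.array(list('AGGGTA'))
s2 = np.array(list('CGTTTA'))
print(np.mean(s1 != s2))  # 0.5, matching the docstring example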
Example 63
Project: sfepy Source File: plot_times.py
def main():
parser = ArgumentParser(description=__doc__)
parser.add_argument('--version', action='version', version='%(prog)s')
parser.add_argument('-l', '--logarithmic',
action='store_true', dest='logarithmic',
default=False, help=helps['logarithmic'])
parser.add_argument('filename')
options = parser.parse_args()
filename = options.filename
plt.rcParams['lines.linewidth'] = 3
plt.rcParams['lines.markersize'] = 9
fontsize = 16
steps, times, nts, dts = extract_times(filename)
dts[-1] = nm.nan
ax = plt.subplot(211)
if options.logarithmic:
l1, = ax.semilogy(steps, dts, 'b')
else:
l1, = ax.plot(steps, dts, 'b')
ax.set_xlabel('step', fontsize=fontsize)
ax.set_ylabel(r'$\Delta t$', fontsize=fontsize)
ax.grid(True)
ax = ax.twinx()
l2, = ax.plot(steps, times, 'g')
ax.set_ylabel(r'$t$', fontsize=fontsize)
ax.legend([l1, l2], [r'$\Delta t$', r'$t$'], loc=0)
ax = plt.subplot(212)
if options.logarithmic:
ax.semilogy(times, dts, 'b+')
else:
ax.plot(times, dts, 'b+')
ax.set_xlabel(r'$t$', fontsize=fontsize)
ax.set_ylabel(r'$\Delta t$', fontsize=fontsize)
ax.grid(True)
plt.show()
Example 64
Project: lifetimes Source File: utils.py
def summary_data_from_transaction_data(transactions, customer_id_col, datetime_col, monetary_value_col=None, datetime_format=None,
observation_period_end=datetime.today(), freq='D'):
"""
This transforms a Dataframe of transaction data of the form:
customer_id, datetime [, monetary_value]
to a Dataframe of the form:
customer_id, frequency, recency, T [, monetary_value]
Parameters:
transactions: a Pandas DataFrame.
customer_id_col: the column in transactions that denotes the customer_id
datetime_col: the column in transactions that denotes the datetime the purchase was made.
monetary_value_col: the columns in the transactions that denotes the monetary value of the transaction.
Optional, only needed for customer lifetime value estimation models.
observation_period_end: a string or datetime to denote the final date of the study. Events
after this date are truncated.
datetime_format: a string that represents the timestamp format. Useful if Pandas can't understand
the provided format.
freq: Default 'D' for days, 'W' for weeks, 'M' for months... etc. Full list here:
http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects
"""
observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format).to_period(freq)
# label all of the repeated transactions
repeated_transactions = find_first_transactions(
transactions,
customer_id_col,
datetime_col,
monetary_value_col,
datetime_format,
observation_period_end,
freq
)
# count all orders by customer.
customers = repeated_transactions.groupby(customer_id_col, sort=False)[datetime_col].agg(['min', 'max', 'count'])
# subtract 1 from count, as we ignore their first order.
customers['frequency'] = customers['count'] - 1
customers['T'] = (observation_period_end - customers['min'])
customers['recency'] = (customers['max'] - customers['min'])
summary_columns = ['frequency', 'recency', 'T']
if monetary_value_col:
# create an index of all the first purchases
first_purchases = repeated_transactions[repeated_transactions['first']].index
# by setting the monetary_value cells of all the first purchases to NaN,
# those values will be excluded from the mean value calculation
repeated_transactions.loc[first_purchases, monetary_value_col] = np.nan
customers['monetary_value'] = repeated_transactions.groupby(customer_id_col)[monetary_value_col].mean().fillna(0)
summary_columns.append('monetary_value')
return customers[summary_columns].astype(float)
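The core frequency / recency / T aggregation above can be sketched in plain pandas for a toy transaction table (illustrative only, with made-up column names; it skips the period conversion, monetary value handling and first-transaction flagging done by lifetimes):

import pandas as pd

tx = pd.DataFrame({
    'customer_id': [1, 1, 1, 2],
    'date': pd.to_datetime(['2020-01-01', '2020-01-05', '2020-01-20', '2020-01-10']),
})
obs_end = pd.Timestamp('2020-01-31')
g = tx.groupby('customer_id')['date'].agg(['min', 'max', 'count'])
summary = pd.DataFrame({
    'frequency': g['count'] - 1,               # repeat purchases only
    'recency': (g['max'] - g['min']).dt.days,  # age at last purchase
    'T': (obs_end - g['min']).dt.days,         # age at end of observation
})
print(summary)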
Example 65
Project: zipline Source File: requests_csv.py
def load_df(self):
df = self.fetch_data()
if self.pre_func:
df = self.pre_func(df)
# Batch-convert the user-specified date column into timestamps.
df['dt'] = self.parse_date_str_series(
self.date_format,
self.timezone,
df[self.date_column],
self.data_frequency,
self.trading_day,
).values
# ignore rows whose dates we couldn't parse
df = df[df['dt'].notnull()]
if self.symbol is not None:
df['sid'] = self.symbol
elif self.finder:
df.sort_values(by=self.symbol_column, inplace=True)
# Pop the 'sid' column off of the DataFrame, just in case the user
# has assigned it, and throw a warning
try:
df.pop('sid')
warnings.warn(
"Assignment of the 'sid' column of a DataFrame is "
"not supported by Fetcher. The 'sid' column has been "
"overwritten.",
category=UserWarning,
stacklevel=2,
)
except KeyError:
# There was no 'sid' column, so no warning is necessary
pass
# Fill entries for any symbols that don't require a date to
# uniquely identify. Entries for which multiple securities exist
# are replaced with zeroes, while entries for which no asset
# exists are replaced with NaNs.
unique_symbols = df[self.symbol_column].unique()
sid_series = pd.Series(
data=map(self._lookup_unconflicted_symbol, unique_symbols),
index=unique_symbols,
name='sid',
)
df = df.join(sid_series, on=self.symbol_column)
# Fill any zero entries left in our sid column by doing a lookup
# using both symbol and the row date.
conflict_rows = df[df['sid'] == 0]
for row_idx, row in conflict_rows.iterrows():
try:
asset = self.finder.lookup_symbol(
row[self.symbol_column],
# Replacing tzinfo here is necessary because of the
# timezone metadata bug described below.
row['dt'].replace(tzinfo=pytz.utc),
# It's possible that no asset comes back here if our
# lookup date is from before any asset held the
# requested symbol. Mark such cases as NaN so that
# they get dropped in the next step.
) or numpy.nan
except SymbolNotFound:
asset = numpy.nan
# Assign the resolved asset to the cell
df.ix[row_idx, 'sid'] = asset
# Filter out rows containing symbols that we failed to find.
length_before_drop = len(df)
df = df[df['sid'].notnull()]
no_sid_count = length_before_drop - len(df)
if no_sid_count:
logger.warn(
"Dropped {} rows from fetched csv.".format(no_sid_count),
no_sid_count,
extra={'syslog': True},
)
else:
df['sid'] = df['symbol']
# Dates are localized to UTC when they come out of
# parse_date_str_series, but we need to re-localize them here because
# of a bug that wasn't fixed until
# https://github.com/pydata/pandas/pull/7092.
# We should be able to remove the call to tz_localize once we're on
# pandas 0.14.0
# We don't set 'dt' as the index until here because the Symbol parsing
# operations above depend on having a unique index for the dataframe,
# and the 'dt' column can contain multiple dates for the same entry.
df.drop_duplicates(["sid", "dt"])
df.set_index(['dt'], inplace=True)
df = df.tz_localize('UTC')
df.sort_index(inplace=True)
cols_to_drop = [self.date_column]
if self.symbol is None:
cols_to_drop.append(self.symbol_column)
df = df[df.columns.drop(cols_to_drop)]
if self.post_func:
df = self.post_func(df)
return df
Example 66
Project: yatsm Source File: masking.py
def smooth_mask(x, Y, span, crit=400, green=1, swir1=4,
maxiter=5):
""" Multi-temporal masking using LOWESS
Taken directly from newer version of CCDC than Zhu and Woodcock, 2014.
This "temporal masking" replaced the older method which used robust
linear models. This version uses a regular LOWESS instead of robust
LOWESS
.. note::
"span" argument is the inverse of "frac" from statsmodels and is
actually 'k' in their code:
`n = x.shape[0]`
`k = int(frac * n + 1e-10)`
.. todo::
We need to put the data on a regular period since span changes as
is right now. Statsmodels will only allow for dropna, so we would
need to impute missing data somehow...
Args:
x (np.ndarray): array of ordinal dates
Y (np.ndarray): matrix of observed spectra
span (int): span of LOWESS
crit (float): critical value for masking clouds/shadows
green (int): 0 indexed value for green band in Y (default: 1)
swir1 (int): 0 indexed value for SWIR (~1.55-1.75um) band
in Y (default: 4)
maxiter (int): maximum increases to span when checking for
NaN in LOWESS results
Returns:
mask (ndarray): mask where False indicates values to be masked
"""
# Reverse span to get frac
frac = span / x.shape[0]
# Estimate delta as "good choice": delta = 0.01 * range(exog)
delta = (x.max() - x.min()) * 0.01
# Run LOWESS checking for NaN in output
i = 0
green_lowess, swir1_lowess = np.nan, np.nan
while (np.any(np.isnan(green_lowess)) or
np.any(np.isnan(swir1_lowess))) and i < maxiter:
green_lowess = sm.nonparametric.lowess(Y[green, :], x,
frac=frac, delta=delta)
swir1_lowess = sm.nonparametric.lowess(Y[swir1, :], x,
frac=frac, delta=delta)
span += 1
frac = span / x.shape[0]
i += 1
mask = (((Y[green, :] - green_lowess[:, 1]) < crit) *
((Y[swir1, :] - swir1_lowess[:, 1]) > -crit))
return mask
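The LOWESS call that the masking above relies on is available from statsmodels. A minimal sketch of smoothing plus a residual-based mask (illustrative only; the threshold below is arbitrary and not the crit logic used by yatsm):

import numpy as np
import statsmodels.api as sm

x = np.arange(100, dtype=float)
y = np.sin(x / 10.0) + 0.1 * np.random.randn(100)
fitted = sm.nonparametric.lowess(y, x, frac=0.3)  # returns columns [x, smoothed y]
residuals = y - fitted[:, 1]
mask = np.abs(residuals) < 3 * residuals.std()    # True where the point tracks the fit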
Example 67
Project: trackpy Source File: test_perf.py
def profile_head_single(benchmark):
import gc
results = []
# just in case
gc.collect()
try:
from ctypes import cdll, CDLL
cdll.LoadLibrary("libc.so.6")
libc = CDLL("libc.so.6")
libc.malloc_trim(0)
except:
pass
N = args.hrepeats + args.burnin
results = []
try:
for i in range(N):
gc.disable()
d=dict()
try:
d = benchmark.run()
except KeyboardInterrupt:
raise
except Exception as e: # if a single vbench bursts into flames, don't die.
err=""
try:
err = d.get("traceback")
if err is None:
err = str(e)
except:
pass
print("%s died with:\n%s\nSkipping...\n" % (benchmark.name, err))
results.append(d.get('timing',np.nan))
gc.enable()
gc.collect()
finally:
gc.enable()
if results:
# throw away the burn_in
results = results[args.burnin:]
sys.stdout.write('.')
sys.stdout.flush()
return Series(results, name=benchmark.name)
Example 68
Project: findatapy Source File: calculations.py
def calculate_individual_trade_gains(self, signal_data_frame, strategy_returns_data_frame):
"""
calculate_individual_trade_gains - Calculates profits on every trade
Parameters
----------
signal_data_frame : DataFrame
trading signals
strategy_returns_data_frame: DataFrame
returns of strategy to be tested
period_shift : int
number of periods to shift signal
Returns
-------
DataFrame
"""
# signal need to be aligned to NEXT period for returns
signal_data_frame_pushed = signal_data_frame.shift(1)
# find all the trade points
trade_points = ((signal_data_frame - signal_data_frame.shift(1)).abs())
cumulative = self.create_mult_index(strategy_returns_data_frame)
indices = trade_points > 0
indices.columns = cumulative.columns
# get P&L for every trade (from the end point - start point)
trade_returns = numpy.nan * cumulative
trade_points_cumulative = cumulative[indices]
# for each set of signals/returns, calculate the trade returns - where there isn't a trade
# assign a NaN
# TODO do in one vectorised step without for loop
for col_name in trade_points_cumulative:
col = trade_points_cumulative[col_name]
col = col.dropna()
col = col / col.shift(1) - 1
# TODO experiment with quicker ways of writing below?
# for val in col.index:
# trade_returns.set_value(val, col_name, col[val])
# trade_returns.ix[val, col_name] = col[val]
date_indices = trade_returns.index.searchsorted(col.index)
trade_returns.ix[date_indices, col_name] = col
return trade_returns
Example 69
Project: empyrical Source File: stats.py
def alpha_aligned(returns, factor_returns, risk_free=0.0, period=DAILY,
annualization=None, _beta=None):
"""Calculates annualized alpha.
If they are pd.Series, expects returns and factor_returns have already
been aligned on their labels. If np.ndarray, these arguments should have
the same shape.
Parameters
----------
returns : pd.Series or np.ndarray
Daily returns of the strategy, noncumulative.
- See full explanation in :func:`~empyrical.stats.cum_returns`.
factor_returns : pd.Series or np.ndarray
Daily noncumulative returns of the factor to which beta is
computed. Usually a benchmark such as the market.
- This is in the same style as returns.
risk_free : int, float, optional
Constant risk-free return throughout the period. For example, the
interest rate on a three month us treasury bill.
period : str, optional
Defines the periodicity of the 'returns' data for purposes of
annualizing. Value ignored if `annualization` parameter is specified.
Defaults are:
'monthly':12
'weekly': 52
'daily': 252
annualization : int, optional
Used to suppress default values available in `period` to convert
returns into annual returns. Value should be the annual frequency of
`returns`.
- See full explanation in :func:`~empyrical.stats.annual_return`.
_beta : float, optional
The beta for the given inputs, if already known. Will be calculated
internally if not provided.
Returns
-------
float
Alpha.
"""
if len(returns) < 2:
return np.nan
ann_factor = annualization_factor(period, annualization)
if _beta is None:
_beta = beta_aligned(returns, factor_returns, risk_free)
adj_returns = _adjust_returns(returns, risk_free)
adj_factor_returns = _adjust_returns(factor_returns, risk_free)
alpha_series = adj_returns - (_beta * adj_factor_returns)
return nanmean(alpha_series) * ann_factor
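The final annualization step above, unrolled by hand for a toy daily series with a supplied beta and zero risk-free rate (a sketch with made-up numbers, not part of empyrical):

import numpy as np

returns = np.array([0.012, 0.021, -0.008, 0.006])
factor_returns = np.array([0.005, 0.010, -0.005, 0.003])
beta = 2.0
ann_factor = 252  # 'daily' periodicity
alpha_series = returns - beta * factor_returns   # adj_returns - beta * adj_factor_returns
alpha = np.nanmean(alpha_series) * ann_factor
print(round(alpha, 4))  # 0.315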
Example 70
Project: refinery Source File: SuffStatBag.py
def mergeComps(self, kA, kB):
''' Merge components kA, kB into a single component
'''
if self.K <= 1:
raise ValueError('Must have at least 2 components to merge.')
if kB == kA:
raise ValueError('Distinct component ids required.')
for key, dims in self._Fields._FieldDims.items():
if dims is not None and dims != ():
arr = getattr(self._Fields, key)
if self.hasMergeTerm(key) and dims == ('K'):
# some special terms need to be precomputed, like sumLogPiActive
arr[kA] = getattr(self._MergeTerms, key)[kA,kB]
else:
# applies to vast majority of all fields
arr[kA] += arr[kB]
if self.hasELBOTerms():
for key, dims in self._ELBOTerms._FieldDims.items():
if self.hasMergeTerm(key) and dims == ('K'):
arr = getattr(self._ELBOTerms, key)
mArr = getattr(self._MergeTerms, key)
arr[kA] = mArr[kA,kB]
if self.hasMergeTerms():
for key, dims in self._MergeTerms._FieldDims.items():
if dims == ('K','K'):
mArr = getattr(self._MergeTerms, key)
mArr[kA,kA+1:] = np.nan
mArr[:kA,kA] = np.nan
self._Fields.removeComp(kB)
if self.hasELBOTerms():
self._ELBOTerms.removeComp(kB)
if self.hasMergeTerms():
self._MergeTerms.removeComp(kB)
Example 71
Project: pyNastran Source File: elements.py
def get_mass_by_element_id(self, element_ids_orig=None, xyz_cid0=None, sort_output=True):
if xyz_cid0 is None:
xyz_cid0 = self.model.grid.get_position_by_node_index()
element_ids, element_ids_orig = self._get_element_ids(element_ids_orig)
if len(element_ids) == 0:
nelements = len(element_ids_orig)
mass = np.full(nelements, np.nan, 'float64')
if sort_output:
i = np.argsort(element_ids_orig)
#print("i =", i, i.shape)
#print("element_ids_orig =", element_ids_orig, element_ids_orig.shape)
return element_ids_orig[i], mass
else:
return element_ids_orig, mass
nelements_orig = len(element_ids_orig)
#print('eids orig = %s' % element_ids_orig)
type_map = {
#'CELAS1' : self.elements_spring.celas1,
'CELAS2' : self.elements_spring.celas2,
'CELAS3' : self.elements_spring.celas3,
'CELAS4' : self.elements_spring.celas4,
'CBAR' : self.cbar,
'CBEAM' : self.cbeam,
'CROD' : self.crod,
'CONROD' : self.conrod,
'CTUBE' : self.ctube,
'CSHEAR' : self.cshear,
'CQUAD4' : self.elements_shell.cquad4,
'CTRIA3' : self.elements_shell.ctria3,
'CTETRA4' : self.elements_solid.ctetra4,
'CPENTA6' : self.elements_solid.cpenta6,
'CHEXA8' : self.elements_solid.chexa8,
'CTETRA10' : self.elements_solid.ctetra10,
'CPENTA15' : self.elements_solid.cpenta15,
'CHEXA20' : self.elements_solid.chexa20,
}
exclude_types = [
'CELAS1', 'CELAS2', 'CELAS3', 'CELAS4',
'CDAMP1', 'CDAMP2', 'CDAMP3', 'CDAMP4',
'CMASS1', 'CMASS2', 'CMASS3', 'CMASS4',
'CBUSH', 'CBUSH1D', 'CBUSH2D',
]
pid_data = self.get_element_ids_by_property_type(element_ids, exclude_types=exclude_types)
element_ids_to_analyze = pid_data[:, 0]
data2 = self.group_elements_by_property_type_and_element_type(self, pid_data)
#self.model.log.debug('**data2 = %s' % data2)
nelements = len(element_ids)
#self.model.log.debug('nelement_ids =', nelements)
eids2 = np.zeros(nelements_orig, dtype='int32')
#mass = full(nelements, nan, dtype='float64')
mass = np.full(nelements_orig, np.nan, dtype='float64')
#self.model.log.debug('mass.shape =', mass.shape)
ni = 0
self.model.log.debug('data2 = %s' % data2)
for (pid, etype), element_ids in iteritems(data2):
#self.model.log.debug('pid=%s eType=%s element_ids=%s' % (pid, eType, element_ids))
elements = type_map[etype]
i = np.searchsorted(elements.element_id, element_ids)
n = len(i)
eids2[ni:ni+n] = elements.element_id[i]
if pid == 0:
# CONROD
pass
else:
self.model.log.debug('*cat pid = %s' % pid)
props = self.get_properties([pid])
if len(props) == 0:
# the property doesn't exist
self.model.log.debug('Property %i does not exist and is needed by %s eid=%i' % (
pid, etype, element_ids[0]))
ni += n
#print('props = %s' % props)
continue
# we only get one property at a time
prop = props[0]
#print(' prop = %s' % str(prop).rstrip())
elements = type_map[etype]
i = np.searchsorted(elements.element_id, element_ids)
n = len(i)
#print('ielements = %s' % i)
if etype in ['CELAS1', 'CELAS2', 'CELAS3', 'CELAS4',]:
pass
elif etype in ['CROD', 'CONROD', 'CBAR', 'CBEAM']:
msg = 'which is required for %ss' % etype
n1 = self.get_nodes(elements.node_ids[i, 0], xyz_cid0, msg=msg)
n2 = self.get_nodes(elements.node_ids[i, 1], xyz_cid0, msg=msg)
length = np.linalg.norm(n2 - n1, axis=1)
#print('prop = %s' % prop)
#print(' calling get_mass_per_area for pid=%s' % (pid))
if etype == 'CONROD':
rho = self.model.materials.get_density_by_material_id(elements.material_id)
mass[ni:ni+n] = length * elements.A[i] * rho[i] + elements.nsm[i]
else:
mpl = prop.get_mass_per_length_by_property_id()
mass[ni:ni+n] = mpl * length
del prop
elif etype in ['CTRIA3', 'CQUAD4', 'CSHEAR']:
if etype == 'CTRIA3':
n1, n2, n3 = (elements.node_ids[i, 0], elements.node_ids[i, 1],
elements.node_ids[i, 2])
n1 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n1), :]
n2 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n2), :]
n3 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n3), :]
area = _ctria3_normal_A(n1, n2, n3,
calculate_area=True, normalize=False)[1]
elif etype in ['CQUAD4', 'CSHEAR']:
n1, n2, n3, n4 = (elements.node_ids[i, 0], elements.node_ids[i, 1],
elements.node_ids[i, 2], elements.node_ids[i, 3])
n1 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n1), :]
n2 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n2), :]
n3 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n3), :]
n4 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n4), :]
area = _cquad4_normal_A(n1, n2, n3, n4,
calculate_area=True, normalize=False)[1]
else:
self.model.log.debug("Element.get_mass doesn't support %s; try %s.get_mass" % (
etype, etype))
ni += n
continue
#print('prop = %s' % prop)
#print(' calling get_mass_per_area for pid=%s' % (pid))
mpa = prop.get_mass_per_area_by_property_id()
mass[ni:ni+n] = mpa * area
del prop
elif etype in ['CTETRA4', 'CTETRA10', 'CPENTA6', 'CPENTA15', 'CHEXA8', 'CHEXA20']:
rho = prop.get_density_by_property_id()
del prop
if etype in ['CTETRA4', 'CTETRA10']:
n1, n2, n3, n4 = (elements.node_ids[i, 0], elements.node_ids[i, 1],
elements.node_ids[i, 2], elements.node_ids[i, 3])
n1 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n1), :]
n2 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n2), :]
n3 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n3), :]
n4 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n4), :]
volumei = np.zeros(n, self.model.float_fmt)
i = 0
for n1i, n2i, n3i, n4i in zip(n1, n2, n3, n4):
volumei[i] = volume4(n1i, n2i, n3i, n4i)
i += 1
elif etype in ['CPENTA6', 'CPENTA15']:
n1, n2, n3, n4, n5, n6 = (elements.node_ids[i, 0], elements.node_ids[i, 1],
elements.node_ids[i, 2], elements.node_ids[i, 3],
elements.node_ids[i, 4], elements.node_ids[i, 5], )
n1 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n1), :]
n2 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n2), :]
n3 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n3), :]
n4 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n4), :]
n5 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n5), :]
n6 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n6), :]
(area1, centroid1) = tri_area_centroid(n1, n2, n3)
(area2, centroid2) = tri_area_centroid(n4, n5, n6)
volumei = (area1 + area2) / 2. * np.linalg.norm(centroid1 - centroid2, axis=1)
elif etype in ['CHEXA8', 'CHEXA20']:
n1, n2, n3, n4, n5, n6, n7, n8 = (
elements.node_ids[i, 0], elements.node_ids[i, 1],
elements.node_ids[i, 2], elements.node_ids[i, 3],
elements.node_ids[i, 4], elements.node_ids[i, 5],
elements.node_ids[i, 6], elements.node_ids[i, 7], )
n1 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n1), :]
n2 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n2), :]
n3 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n3), :]
n4 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n4), :]
n5 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n5), :]
n6 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n6), :]
n7 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n7), :]
n8 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n8), :]
(area1, centroid1) = quad_area_centroid(n1, n2, n3, n4)
(area2, centroid2) = quad_area_centroid(n5, n6, n7, n8)
volumei = (area1 + area2) / 2. * np.linalg.norm(centroid1 - centroid2, axis=1)
else:
msg = "Element.get_mass doesn't support %s; try %s.get_mass" % (etype, etype)
self.model.log.debug(msg)
ni += n
continue
mass[ni:ni+n] = volumei * rho
else:
msg = " Element.get_mass doesn't support %s; try %s.get_mass" % (etype, etype)
self.model.log.debug(msg)
msg = "Element.get_mass doesn't support %s; try %s.get_mass" % (etype, etype)
raise NotImplementedError(msg)
#self.model.log.debug("")
ni += n
#self.model.log.debug('data2 = %s' % data2)
diff = np.setdiff1d(element_ids_orig, element_ids_to_analyze)
#print('A = %s' % element_ids_orig)
#print('B = %s' % element_ids_to_analyze)
#print('A-B = %s' % (diff))
#print('eids2 = %s' % eids2)
#if len(diff):
#print('cumdiff = %s' % list(diff))
#print('eids out = %s' % eids2)
#print('len(eids2) =', len(eids2))
#print('len(diff) =', len(diff))
eids2[ni:] = diff
if sort_output:
i = np.argsort(eids2)
eids2 = eids2[i]
mass = mass[i]
return eids2, mass
Example 72
def get_sequence(sequence, writebox, spaces=False):
"""Returns a sequence
sequence list: the sequence of integers
writebox list: [min x, max x, min y, max y]
"""
nans = np.array([np.nan, np.nan])
nums= nans.copy()
if spaces is False:
each_num_width = (writebox[1] - writebox[0]) / float(len(sequence))
else:
each_num_width = (writebox[1] - writebox[0]) / float(len(sequence)*2 - 1)
for ii, nn in enumerate(sequence):
if spaces is False:
num_writebox = [writebox[0] + each_num_width * ii ,
writebox[0] + each_num_width * (ii+1),
writebox[2], writebox[3]]
else:
num_writebox = [writebox[0] + each_num_width * 2 * ii ,
writebox[0] + each_num_width * 2 * (ii+.5),
writebox[2], writebox[3]]
if isinstance(nn, int):
nn = str(nn)
num = get_raw_data(nn, num_writebox)
nums = np.vstack([nums, num, nans])
return nums
Example 73
def load_raw_arrays(self, columns, start_date, end_date, assets):
"""
Parameters
----------
fields : list of str
'sid'
start_dt: Timestamp
Beginning of the window range.
end_dt: Timestamp
End of the window range.
sids : list of int
The asset identifiers in the window.
Returns
-------
list of np.ndarray
A list with an entry per field of ndarrays with shape
(minutes in range, sids) with a dtype of float64, containing the
values for the respective field over start and end dt range.
"""
rolls_by_asset = {}
for asset in assets:
rf = self._roll_finders[asset.roll_style]
rolls_by_asset[asset] = rf.get_rolls(
asset.root_symbol, start_date, end_date, asset.offset)
num_sessions = len(
self.trading_calendar.sessions_in_range(start_date, end_date))
shape = num_sessions, len(assets)
results = []
tc = self._bar_reader.trading_calendar
sessions = tc.sessions_in_range(start_date, end_date)
# Get partitions
partitions_by_asset = {}
for asset in assets:
rolls_by_asset[asset] = rf.get_rolls(
asset.root_symbol, start_date, end_date, asset.offset)
partitions = []
partitions_by_asset[asset] = partitions
rolls = rolls_by_asset[asset]
start = start_date
for roll in rolls:
sid, roll_date = roll
start_loc = sessions.get_loc(start)
if roll_date is not None:
end = roll_date - sessions.freq
end_loc = sessions.get_loc(end)
else:
end = end_date
end_loc = len(sessions) - 1
partitions.append((sid, start, end, start_loc, end_loc))
if roll[-1] is not None:
start = sessions[end_loc + 1]
for column in columns:
if column != 'volume' and column != 'sid':
out = np.full(shape, np.nan)
else:
out = np.zeros(shape, dtype=np.int64)
for i, asset in enumerate(assets):
partitions = partitions_by_asset[asset]
for sid, start, end, start_loc, end_loc in partitions:
if column != 'sid':
result = self._bar_reader.load_raw_arrays(
[column], start, end, [sid])[0][:, 0]
else:
result = int(sid)
out[start_loc:end_loc + 1, i] = result
results.append(out)
return results
Example 74
Project: verif Source File: Input.py
def __init__(self, filename):
import csv
Input.__init__(self, filename)
file = open(filename, 'r')
self._units = "Unknown units"
self._variable = "Unknown"
self._pit = None
self._dates = set()
self._offsets = set()
self._stations = set()
self._quantiles = set()
self._thresholds = set()
fields = dict()
obs = dict()
fcst = dict()
cdf = dict()
pit = dict()
x = dict()
indices = dict()
header = None
# Default values if columns not available
offset = 0
date = 0
lat = 0
lon = 0
elev = 0
# Store station data, to ensure we don't have conflicting lat/lon/elev info for the same ids
stationInfo = dict()
shownConflictingWarning = False
import time
start = time.time()
# Read the data into dictionary with (date,offset,lat,lon,elev) as key and obs/fcst as values
for rowstr in file:
if(rowstr[0] == "#"):
curr = rowstr[1:]
curr = curr.split()
if(curr[0] == "variable:"):
self._variable = ' '.join(curr[1:])
elif(curr[0] == "units:"):
self._units = curr[1]
else:
Util.warning("Ignoring line '" + rowstr.strip() + "' in file '" + filename + "'")
else:
row = rowstr.split()
if(header is None):
# Parse the header so we know what each column represents
header = row
for i in range(0, len(header)):
att = header[i]
if(att == "date"):
indices["date"] = i
elif(att == "offset"):
indices["offset"] = i
elif(att == "lat"):
indices["lat"] = i
elif(att == "lon"):
indices["lon"] = i
elif(att == "elev"):
indices["elev"] = i
elif(att == "obs"):
indices["obs"] = i
elif(att == "fcst"):
indices["fcst"] = i
else:
indices[att] = i
# Ensure we have required columns
requiredColumns = ["obs", "fcst"]
for col in requiredColumns:
if(col not in indices):
msg = "Could not parse %s: Missing column '%s'" % (filename, col)
Util.error(msg)
else:
if(len(row) != len(header)):
Util.error("Incorrect number of columns (expecting %d) in row '%s'"
% (len(header), rowstr.strip()))
if("date" in indices):
date = self._clean(row[indices["date"]])
self._dates.add(date)
if("offset" in indices):
offset = self._clean(row[indices["offset"]])
self._offsets.add(offset)
if("id" in indices):
id = self._clean(row[indices["id"]])
else:
id = np.nan
# Lookup previous stationInfo
currLat = np.nan
currLon = np.nan
currElev = np.nan
if("lat" in indices):
currLat = self._clean(row[indices["lat"]])
if("lon" in indices):
currLon = self._clean(row[indices["lon"]])
if("elev" in indices):
currElev = self._clean(row[indices["elev"]])
if not np.isnan(id) and id in stationInfo:
lat = stationInfo[id].lat()
lon = stationInfo[id].lon()
elev = stationInfo[id].elev()
if not shownConflictingWarning:
if (not np.isnan(currLat) and abs(currLat - lat) > 0.0001) or (not np.isnan(currLon) and abs(currLon - lon) > 0.0001) or (not np.isnan(currElev) and abs(currElev - elev) > 0.001):
print(currLat - lat, currLon - lon, currElev - elev)
Util.warning("Conflicting lat/lon/elev information: (%f,%f,%f) does not match (%f,%f,%f)" % (currLat, currLon, currElev, lat, lon, elev))
shownConflictingWarning = True
else:
if np.isnan(currLat):
currLat = 0
if np.isnan(currLon):
currLon = 0
if np.isnan(currElev):
currElev = 0
station = Station.Station(id, currLat, currLon, currElev)
self._stations.add(station)
stationInfo[id] = station
lat = stationInfo[id].lat()
lon = stationInfo[id].lon()
elev = stationInfo[id].elev()
key = (date, offset, lat, lon, elev)
obs[key] = self._clean(row[indices["obs"]])
fcst[key] = self._clean(row[indices["fcst"]])
quantileFields = self._getQuantileFields(header)
thresholdFields = self._getThresholdFields(header)
if "pit" in indices:
pit[key] = self._clean(row[indices["pit"]])
for field in quantileFields:
quantile = float(field[1:])
self._quantiles.add(quantile)
key = (date, offset, lat, lon, elev, quantile)
x[key] = self._clean(row[indices[field]])
for field in thresholdFields:
threshold = float(field[1:])
self._thresholds.add(threshold)
key = (date, offset, lat, lon, elev, threshold)
cdf[key] = self._clean(row[indices[field]])
end = time.time()
file.close()
self._dates = list(self._dates)
self._offsets = list(self._offsets)
self._stations = list(self._stations)
self._quantiles = list(self._quantiles)
self._thresholds = np.array(list(self._thresholds))
Ndates = len(self._dates)
Noffsets = len(self._offsets)
Nlocations = len(self._stations)
Nquantiles = len(self._quantiles)
Nthresholds = len(self._thresholds)
# Put the dictionary data into a regular 3D array
self._obs = np.zeros([Ndates, Noffsets, Nlocations], 'float') * np.nan
self._fcst = np.zeros([Ndates, Noffsets, Nlocations], 'float') * np.nan
if(len(pit) != 0):
self._pit = np.zeros([Ndates, Noffsets, Nlocations], 'float') * np.nan
self._cdf = np.zeros([Ndates, Noffsets, Nlocations, Nthresholds], 'float') * np.nan
self._x = np.zeros([Ndates, Noffsets, Nlocations, Nquantiles], 'float') * np.nan
for d in range(0, len(self._dates)):
date = self._dates[d]
end = time.time()
for o in range(0, len(self._offsets)):
offset = self._offsets[o]
for s in range(0, len(self._stations)):
station = self._stations[s]
lat = station.lat()
lon = station.lon()
elev = station.elev()
key = (date, offset, lat, lon, elev)
if(key in obs):
self._obs[d][o][s] = obs[key]
if(key in fcst):
self._fcst[d][o][s] = fcst[key]
if(key in pit):
self._pit[d][o][s] = pit[key]
for q in range(0, len(self._quantiles)):
quantile = self._quantiles[q]
key = (date, offset, lat, lon, elev, quantile)
if(key in x):
self._x[d, o, s, q] = x[key]
for t in range(0, len(self._thresholds)):
threshold = self._thresholds[t]
key = (date, offset, lat, lon, elev, threshold)
if(key in cdf):
self._cdf[d, o, s, t] = cdf[key]
end = time.time()
maxStationId = np.nan
for station in self._stations:
if(np.isnan(maxStationId)):
maxStationId = station.id()
elif(station.id() > maxStationId):
maxStationId = station.id()
counter = 0
if(not np.isnan(maxStationId)):
counter = maxStationId + 1
for station in self._stations:
if(np.isnan(station.id())):
station.id(counter)
counter = counter + 1
self._dates = np.array(self._dates)
self._offsets = np.array(self._offsets)
Example 75
def test_roc_curve_toydata():
# Binary classification
y_true = [0, 1]
y_score = [0, 1]
tpr, fpr, _ = roc_curve(y_true, y_score)
roc_auc = roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1])
assert_array_almost_equal(fpr, [1, 1])
assert_almost_equal(roc_auc, 1.)
y_true = [0, 1]
y_score = [1, 0]
tpr, fpr, _ = roc_curve(y_true, y_score)
roc_auc = roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1, 1])
assert_array_almost_equal(fpr, [0, 0, 1])
assert_almost_equal(roc_auc, 0.)
y_true = [1, 0]
y_score = [1, 1]
tpr, fpr, _ = roc_curve(y_true, y_score)
roc_auc = roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1])
assert_array_almost_equal(fpr, [0, 1])
assert_almost_equal(roc_auc, 0.5)
y_true = [1, 0]
y_score = [1, 0]
tpr, fpr, _ = roc_curve(y_true, y_score)
roc_auc = roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1])
assert_array_almost_equal(fpr, [1, 1])
assert_almost_equal(roc_auc, 1.)
y_true = [1, 0]
y_score = [0.5, 0.5]
tpr, fpr, _ = roc_curve(y_true, y_score)
roc_auc = roc_auc_score(y_true, y_score)
assert_array_almost_equal(tpr, [0, 1])
assert_array_almost_equal(fpr, [0, 1])
assert_almost_equal(roc_auc, .5)
y_true = [0, 0]
y_score = [0.25, 0.75]
# assert UndefinedMetricWarning because of no positive sample in y_true
tpr, fpr, _ = assert_warns(UndefinedMetricWarning, roc_curve, y_true, y_score)
assert_raises(ValueError, roc_auc_score, y_true, y_score)
assert_array_almost_equal(tpr, [0., 0.5, 1.])
assert_array_almost_equal(fpr, [np.nan, np.nan, np.nan])
y_true = [1, 1]
y_score = [0.25, 0.75]
# assert UndefinedMetricWarning because of no negative sample in y_true
tpr, fpr, _ = assert_warns(UndefinedMetricWarning, roc_curve, y_true, y_score)
assert_raises(ValueError, roc_auc_score, y_true, y_score)
assert_array_almost_equal(tpr, [np.nan, np.nan])
assert_array_almost_equal(fpr, [0.5, 1.])
# Multi-label classification task
y_true = np.array([[0, 1], [0, 1]])
y_score = np.array([[0, 1], [0, 1]])
assert_raises(ValueError, roc_auc_score, y_true, y_score, average="macro")
assert_raises(ValueError, roc_auc_score, y_true, y_score,
average="weighted")
assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 1.)
assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 1.)
y_true = np.array([[0, 1], [0, 1]])
y_score = np.array([[0, 1], [1, 0]])
assert_raises(ValueError, roc_auc_score, y_true, y_score, average="macro")
assert_raises(ValueError, roc_auc_score, y_true, y_score,
average="weighted")
assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5)
assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5)
y_true = np.array([[1, 0], [0, 1]])
y_score = np.array([[0, 1], [1, 0]])
assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0)
assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0)
assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0)
assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0)
y_true = np.array([[1, 0], [0, 1]])
y_score = np.array([[0.5, 0.5], [0.5, 0.5]])
assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), .5)
assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), .5)
assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), .5)
assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), .5)
0
Example 76
Project: blaze Source File: test_pandas_compute.py
def test_outer_join():
left = [(1, 'Alice', 100),
(2, 'Bob', 200),
(4, 'Dennis', 400)]
left = DataFrame(left, columns=['id', 'name', 'amount'])
right = [('NYC', 1),
('Boston', 1),
('LA', 3),
('Moscow', 4)]
right = DataFrame(right, columns=['city', 'id'])
lsym = symbol('lsym', 'var * {id: int, name: string, amount: real}')
rsym = symbol('rsym', 'var * {city: string, id: int}')
convert = lambda df: set(df.to_records(index=False).tolist())
assert (convert(compute(join(lsym, rsym), {lsym: left, rsym: right})) ==
set([(1, 'Alice', 100, 'NYC'),
(1, 'Alice', 100, 'Boston'),
(4, 'Dennis', 400, 'Moscow')]))
assert (convert(compute(join(lsym, rsym, how='left'),
{lsym: left, rsym: right})) ==
set([(1, 'Alice', 100, 'NYC'),
(1, 'Alice', 100, 'Boston'),
(2, 'Bob', 200, np.nan),
(4, 'Dennis', 400, 'Moscow')]))
df = compute(join(lsym, rsym, how='right'), {lsym: left, rsym: right})
expected = DataFrame([(1., 'Alice', 100., 'NYC'),
(1., 'Alice', 100., 'Boston'),
(3., np.nan, np.nan, 'lsymA'),
(4., 'Dennis', 400., 'Moscow')],
columns=['id', 'name', 'amount', 'city'])
result = pdsort(df, 'id').to_records(index=False)
expected = pdsort(expected, 'id').to_records(index=False)
np.array_equal(result, expected)
df = compute(join(lsym, rsym, how='outer'), {lsym: left, rsym: right})
expected = DataFrame([(1., 'Alice', 100., 'NYC'),
(1., 'Alice', 100., 'Boston'),
(2., 'Bob', 200., np.nan),
(3., np.nan, np.nan, 'LA'),
(4., 'Dennis', 400., 'Moscow')],
columns=['id', 'name', 'amount', 'city'])
result = pdsort(df, 'id').to_records(index=False)
expected = pdsort(expected, 'id').to_records(index=False)
np.array_equal(result, expected)
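The expected frames above use np.nan to pad the unmatched side of left, right, and outer joins. The same behaviour can be reproduced with pandas alone, independent of blaze:

import numpy as np
import pandas as pd

left = pd.DataFrame({'id': [1, 2, 4], 'amount': [100, 200, 400]})
right = pd.DataFrame({'id': [1, 3], 'city': ['NYC', 'LA']})

joined = left.merge(right, on='id', how='outer')
# Rows with no match on the other side are padded with NaN.
print(joined[joined['city'].isna()])    # ids 2 and 4 have no city
print(joined[joined['amount'].isna()])  # id 3 has no amount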
0
Example 77
Project: scikit-bio Source File: distance.py
@experimental(as_of='0.5.0')
def kmer_distance(seq1, seq2, k, overlap=True):
"""Compute the kmer distance between a pair of sequences
The kmer distance between two sequences is the fraction of kmers that are
unique to either sequence.
Parameters
----------
seq1, seq2 : Sequence
Sequences to compute kmer distance between.
k : int
The kmer length.
overlap : bool, optional
Defines whether the kmers should be overlapping or not.
Returns
-------
float
kmer distance between `seq1` and `seq2`.
Raises
------
ValueError
If `k` is less than 1.
TypeError
If `seq1` and `seq2` are not ``Sequence`` instances.
TypeError
If `seq1` and `seq2` are not the same type.
Notes
-----
kmer counts are not incorporated in this distance metric.
``np.nan`` will be returned if there are no kmers defined for the
sequences.
Examples
--------
>>> from skbio import Sequence
>>> seq1 = Sequence('ATCGGCGAT')
>>> seq2 = Sequence('GCAGATGTG')
>>> kmer_distance(seq1, seq2, 3) # doctest: +ELLIPSIS
0.9230769230...
"""
_check_seqs(seq1, seq2)
seq1_kmers = set(map(str, seq1.iter_kmers(k, overlap=overlap)))
seq2_kmers = set(map(str, seq2.iter_kmers(k, overlap=overlap)))
all_kmers = seq1_kmers | seq2_kmers
if not all_kmers:
return np.nan
shared_kmers = seq1_kmers & seq2_kmers
number_unique = len(all_kmers) - len(shared_kmers)
fraction_unique = number_unique / len(all_kmers)
return fraction_unique
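As the notes in the docstring say, kmer_distance falls back to np.nan when neither sequence yields any kmers, for example when k exceeds both sequence lengths. A hedged usage sketch, assuming scikit-bio is installed and the function is importable from skbio.sequence.distance:

import numpy as np
from skbio import Sequence
from skbio.sequence.distance import kmer_distance

seq1 = Sequence('AT')
seq2 = Sequence('GC')
d = kmer_distance(seq1, seq2, 3)  # k longer than both sequences: no kmers at all
assert np.isnan(d)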
0
Example 78
Project: iris Source File: __init__.py
def _regularise(grib_message):
"""
Transform a reduced grid to a regular grid using interpolation.
Uses 1d linear interpolation at constant latitude to make the grid
regular. If the longitude dimension is circular then this is taken
into account by the interpolation. If the longitude dimension is not
circular then extrapolation is allowed to make sure all end regular
grid points get a value. In practice this extrapolation is likely to
be minimal.
"""
# Make sure to read any missing values as NaN.
gribapi.grib_set_double(grib_message, "missingValue", np.nan)
# Get full longitude values, these describe the longitude value of
# *every* point in the grid, they are not 1d monotonic coordinates.
lons = gribapi.grib_get_double_array(grib_message, "longitudes")
# Compute the new longitude coordinate for the regular grid.
new_nx = max(gribapi.grib_get_long_array(grib_message, "pl"))
new_x_step = (max(lons) - min(lons)) / (new_nx - 1)
if gribapi.grib_get_long(grib_message, "iScansNegatively"):
new_x_step *= -1
new_lons = np.arange(new_nx) * new_x_step + lons[0]
# Get full latitude and data values, these describe the latitude and
# data values of *every* point in the grid, they are not 1d monotonic
# coordinates.
lats = gribapi.grib_get_double_array(grib_message, "latitudes")
values = gribapi.grib_get_double_array(grib_message, "values")
# Retrieve the distinct latitudes from the GRIB message. GRIBAPI docs
# don't specify if these points are guaranteed to be oriented correctly so
# the safe option is to sort them into ascending (south-to-north) order
# and then reverse the order if necessary.
new_lats = gribapi.grib_get_double_array(grib_message, "distinctLatitudes")
new_lats.sort()
if not gribapi.grib_get_long(grib_message, "jScansPositively"):
new_lats = new_lats[::-1]
ny = new_lats.shape[0]
# Use 1d linear interpolation along latitude circles to regularise the
# reduced data.
cyclic = _longitude_is_cyclic(new_lons)
new_values = np.empty([ny, new_nx], dtype=values.dtype)
for ilat, lat in enumerate(new_lats):
idx = np.where(lats == lat)
llons = lons[idx]
vvalues = values[idx]
if cyclic:
# For cyclic data we insert dummy points at each end to ensure
# we can interpolate to all output longitudes using pure
# interpolation.
cgap = (360 - llons[-1] - llons[0])
llons = np.concatenate(
(llons[0:1] - cgap, llons, llons[-1:] + cgap))
vvalues = np.concatenate(
(vvalues[-1:], vvalues, vvalues[0:1]))
fixed_latitude_interpolator = scipy.interpolate.interp1d(
llons, vvalues)
else:
# Allow extrapolation for non-cyclic data sets to ensure we can
# interpolate to all output longitudes.
fixed_latitude_interpolator = Linear1dExtrapolator(
scipy.interpolate.interp1d(llons, vvalues))
new_values[ilat] = fixed_latitude_interpolator(new_lons)
new_values = new_values.flatten()
# Set flags for the regularised data.
if np.isnan(new_values).any():
# Account for any missing data.
gribapi.grib_set_double(grib_message, "missingValue", np.inf)
gribapi.grib_set(grib_message, "bitmapPresent", 1)
new_values = np.where(np.isnan(new_values), np.inf, new_values)
gribapi.grib_set_long(grib_message, "Nx", int(new_nx))
gribapi.grib_set_double(grib_message,
"iDirectionIncrementInDegrees", float(new_x_step))
gribapi.grib_set_double_array(grib_message, "values", new_values)
gribapi.grib_set_long(grib_message, "jPointsAreConsecutive", 0)
gribapi.grib_set_long(grib_message, "PLPresent", 0)
0
Example 79
def compute(self, txns):
"""Compute the long/short live-to-date transaction level profit and loss. Uses an open average calculation"""
txndata = txns.frame
mktdata = txns.pricer.get_eod_frame()
if not isinstance(mktdata.index, pd.DatetimeIndex):
mktdata.to_timestamp(freq='B')
# get the set of all txn dts and mkt data dts
pl = pd.merge(txndata, mktdata.reset_index(), how='outer', on=TPL.DT)
if pl[TC.PID].isnull().all():
ltd_frame = pd.DataFrame(index=pl.index)
ltd_frame[TPL.DT] = pl[PL.DT]
ltd_frame[TPL.POS] = 0
ltd_frame[TPL.PID] = 0
ltd_frame[TPL.TID] = 0
ltd_frame[TPL.TXN_QTY] = np.nan
ltd_frame[TPL.TXN_PX] = np.nan
ltd_frame[TPL.TXN_FEES] = 0
ltd_frame[TPL.TXN_PREMIUM] = 0
ltd_frame[TPL.TXN_INTENT] = 0
ltd_frame[TPL.TXN_ACTION] = 0
ltd_frame[TPL.CLOSE_PX] = pl[TPL.CLOSE_PX]
ltd_frame[TPL.OPEN_VAL] = 0
ltd_frame[TPL.MKT_VAL] = 0
ltd_frame[TPL.TOT_VAL] = 0
ltd_frame[TPL.DVDS] = 0
ltd_frame[TPL.FEES] = 0
ltd_frame[TPL.RPL_GROSS] = 0
ltd_frame[TPL.RPL] = 0
ltd_frame[TPL.UPL] = 0
ltd_frame[TPL.PL] = 0
return ltd_frame
else:
pl.sort([TC.DT, TC.PID, TC.TID], inplace=1)
pl.reset_index(inplace=1, drop=1)
# check that all days can be priced
has_position = pl[TC.PID] > 0
missing_pxs = pl[MC.CLOSE].isnull()
missing = pl[TC.DT][has_position & missing_pxs]
if len(missing) > 0:
msg = 'insufficient price data: {0} prices missing for dates {1}'
mdates = ','.join([_.strftime('%Y-%m-%d') for _ in set(missing[:5])])
mdates += (len(missing) > 5 and '...' or '')
raise Exception(msg.format(len(missing), mdates))
# Now there is a row for every timestamp. Now compute the pl and fill in where missing data should be
cols = [TC.DT, TC.POS, TC.PID, TC.TID, TC.INTENT, TC.ACTION, TC.FEES, TC.QTY, TC.PX, TC.PREMIUM,
TC.OPEN_VAL]
dts, pos_qtys, pids, tids, intents, sides, txn_fees, txn_qtys, txn_pxs, premiums, open_vals = [pl[c] for c
in
cols]
dvds, closing_pxs, mkt_vals = [pl[c] for c in [MC.DVDS, MC.CLOSE, MC.MKT_VAL]]
# Ensure only end of day is kept for dividends (join will match dvd to any transaction during the day)
dvds = dvds.where(dts != dts.shift(-1), 0)
# fill in pl dates
open_vals.ffill(inplace=1)
open_vals.fillna(0, inplace=1)
pos_qtys.ffill(inplace=1)
pos_qtys.fillna(0, inplace=1)
# pid is the only tricky one, copy only while position is open
inpos = intents.notnull() | (pos_qtys != 0)
pids = np.where(inpos, pids.ffill(), 0)
pl['pid'] = pids.astype(int)
# Zero fill missing
dvds.fillna(0, inplace=1)
tids.fillna(0, inplace=1)
tids = tids.astype(int)
intents.fillna(0, inplace=1)
intents = intents.astype(int)
sides.fillna(0, inplace=1)
sides = sides.astype(int)
txn_fees.fillna(0, inplace=1)
premiums.fillna(0, inplace=1)
# LTD p/l calculation
fees = txn_fees.cumsum()
total_vals = premiums.cumsum()
mkt_vals *= pos_qtys
dvds = (dvds * pos_qtys).cumsum()
rpl_gross = total_vals - open_vals
rpl = rpl_gross + fees + dvds
upl = mkt_vals + open_vals
tpl = upl + rpl
# build the result
data = OrderedDict()
data[TPL.DT] = dts
data[TPL.POS] = pos_qtys
data[TPL.PID] = pids
data[TPL.TID] = tids
data[TPL.TXN_QTY] = txn_qtys
data[TPL.TXN_PX] = txn_pxs
data[TPL.TXN_FEES] = txn_fees
data[TPL.TXN_PREMIUM] = premiums
data[TPL.TXN_INTENT] = intents
data[TPL.TXN_ACTION] = sides
data[TPL.CLOSE_PX] = closing_pxs
data[TPL.OPEN_VAL] = open_vals
data[TPL.MKT_VAL] = mkt_vals
data[TPL.TOT_VAL] = total_vals
data[TPL.DVDS] = dvds
data[TPL.FEES] = fees
data[TPL.RPL_GROSS] = rpl_gross
data[TPL.RPL] = rpl
data[TPL.UPL] = upl
data[TPL.PL] = tpl
ltd_frame = pd.DataFrame(data, columns=data.keys())
return ltd_frame
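The frame built above deliberately leaves the transaction columns as NaN on rows where no trade occurred, while position-like columns are forward-filled and then zero-filled. A small pandas sketch of that fill pattern (the column names here are illustrative, not the TPL constants used above):

import numpy as np
import pandas as pd

pl = pd.DataFrame({
    'open_val': [np.nan, 100.0, np.nan, np.nan],
    'pos_qty':  [np.nan, 10.0, np.nan, 0.0],
    'txn_qty':  [np.nan, 10.0, np.nan, -10.0],
})

# Carry valuations and positions forward, treating the leading gap as flat ...
pl['open_val'] = pl['open_val'].ffill().fillna(0)
pl['pos_qty'] = pl['pos_qty'].ffill().fillna(0)
# ... but leave txn_qty as NaN wherever no transaction happened.
print(pl)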
0
Example 80
Project: zipline Source File: cumulative.py
def __init__(self, sim_params, treasury_curves, trading_calendar,
create_first_day_stats=False):
self.treasury_curves = treasury_curves
self.trading_calendar = trading_calendar
self.start_session = sim_params.start_session
self.end_session = sim_params.end_session
self.sessions = trading_calendar.sessions_in_range(
self.start_session, self.end_session
)
# Hold on to the trading day before the start,
# used for index of the zero return value when forcing returns
# on the first day.
self.day_before_start = self.start_session - self.sessions.freq
last_day = normalize_date(sim_params.end_session)
if last_day not in self.sessions:
last_day = pd.tseries.index.DatetimeIndex(
[last_day]
)
self.sessions = self.sessions.append(last_day)
self.sim_params = sim_params
self.create_first_day_stats = create_first_day_stats
cont_index = self.sessions
self.cont_index = cont_index
self.cont_len = len(self.cont_index)
empty_cont = np.full(self.cont_len, np.nan)
self.algorithm_returns_cont = empty_cont.copy()
self.benchmark_returns_cont = empty_cont.copy()
self.algorithm_cumulative_leverages_cont = empty_cont.copy()
self.mean_returns_cont = empty_cont.copy()
self.annualized_mean_returns_cont = empty_cont.copy()
self.mean_benchmark_returns_cont = empty_cont.copy()
self.annualized_mean_benchmark_returns_cont = empty_cont.copy()
# The returns at a given time are read and reset from the respective
# returns container.
self.algorithm_returns = None
self.benchmark_returns = None
self.mean_returns = None
self.annualized_mean_returns = None
self.mean_benchmark_returns = None
self.annualized_mean_benchmark_returns = None
self.algorithm_cumulative_returns = empty_cont.copy()
self.benchmark_cumulative_returns = empty_cont.copy()
self.algorithm_cumulative_leverages = empty_cont.copy()
self.excess_returns = empty_cont.copy()
self.latest_dt_loc = 0
self.latest_dt = cont_index[0]
self.benchmark_volatility = empty_cont.copy()
self.algorithm_volatility = empty_cont.copy()
self.beta = empty_cont.copy()
self.alpha = empty_cont.copy()
self.sharpe = empty_cont.copy()
self.downside_risk = empty_cont.copy()
self.sortino = empty_cont.copy()
self.information = empty_cont.copy()
self.drawdowns = empty_cont.copy()
self.max_drawdowns = empty_cont.copy()
self.max_drawdown = 0
self.max_leverages = empty_cont.copy()
self.max_leverage = 0
self.current_max = -np.inf
self.daily_treasury = pd.Series(index=self.sessions)
self.treasury_period_return = np.nan
self.num_trading_days = 0
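Every running metric above starts life as an array of NaN created with np.full, so a session that has not been computed yet is distinguishable from a genuine zero. A minimal sketch of that convention:

import numpy as np

n_sessions = 5
sharpe = np.full(n_sessions, np.nan)

# Fill metrics in as sessions complete; untouched slots stay NaN.
sharpe[0] = 0.0
sharpe[1] = -1.2

print(~np.isnan(sharpe))   # [ True  True False False False]
print(np.nanmax(sharpe))   # 0.0, ignoring the NaN placeholders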
0
Example 81
Project: tia Source File: ta.py
def cross_signal(s1, s2, continuous=0):
""" return a signal with the following
1 : when all values of s1 cross above all values of s2
-1 : when all values of s1 cross below all values of s2
0 : if s1 < max(s2) and s1 > min(s2)
np.nan : if s1 or s2 contains np.nan at position
s1: Series, DataFrame, float, int, or tuple(float|int)
s2: Series, DataFrame, float, int, or tuple(float|int)
continuous: bool, if true then once the signal starts it is always 1 or -1
"""
def _convert(src, other):
if isinstance(src, pd.DataFrame):
return src.min(axis=1, skipna=0), src.max(axis=1, skipna=0)
elif isinstance(src, pd.Series):
return src, src
elif isinstance(src, (int, float)):
s = pd.Series(src, index=other.index)
return s, s
elif isinstance(src, (tuple, list)):
l, u = min(src), max(src)
assert l <= u, 'lower bound must be less than upper bound'
lower, upper = pd.Series(l, index=other.index), pd.Series(u, index=other.index)
return lower, upper
else:
raise Exception('unable to handle type %s' % type(src))
lower1, upper1 = _convert(s1, s2)
lower2, upper2 = _convert(s2, s1)
df = pd.DataFrame({'upper1': upper1, 'lower1': lower1, 'upper2': upper2, 'lower2': lower2})
df.ffill(inplace=True)
signal = pd.Series(np.nan, index=df.index)
signal[df.upper1 > df.upper2] = 1
signal[df.lower1 < df.lower2] = -1
if continuous:
# Just roll with 1, -1
signal = signal.fillna(method='ffill')
m1, m2 = df.upper1.first_valid_index(), df.upper2.first_valid_index()
if m1 is not None or m2 is not None:
m1 = m2 if m1 is None else m1
m2 = m1 if m2 is None else m2
fv = max(m1, m2)
if np.isnan(signal[fv]):
signal[fv] = 0
signal.ffill(inplace=1)
else:
signal[(df.upper1 < df.upper2) & (df.lower1 > df.lower2)] = 0
# special handling when equal, determine where it previously was
eq = (df.upper1 == df.upper2)
if eq.any(): # Set to prior value
tmp = signal[eq]
for i in tmp.index:
loc = signal.index.get_loc(i)
if loc != 0:
u, l = df.upper2.iloc[loc], df.lower2.iloc[loc]
ps = signal.iloc[loc - 1]
if u == l or ps == 1.: # Line coming from above upper bound if ps == 1
signal[i] = ps
else:
signal[i] = 0
eq = (df.lower1 == df.lower2)
if eq.any(): # Set to prior value
tmp = signal[eq]
for i in tmp.index:
loc = signal.index.get_loc(i)
if loc != 0:
u, l = df.upper2.iloc[loc], df.lower2.iloc[loc]
ps = signal.iloc[loc - 1]
if u == l or ps == -1.: # Line coming from below lower bound if ps == -1
signal[i] = ps
else:
signal[i] = 0
return signal
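The core of the signal logic above is a series seeded entirely with np.nan that only gets overwritten where a crossing condition fires; forward-filling then gives the 'continuous' behaviour. A pandas-only sketch of that seeding pattern (not the tia function itself):

import numpy as np
import pandas as pd

idx = pd.date_range('2020-01-01', periods=5)
s1 = pd.Series([1.0, 2.0, 3.0, 2.0, 1.0], index=idx)
s2 = pd.Series([2.0, 2.0, 2.0, 2.0, 2.0], index=idx)

signal = pd.Series(np.nan, index=idx)  # undefined everywhere to start
signal[s1 > s2] = 1
signal[s1 < s2] = -1

continuous = signal.ffill()            # once started, carry the signal forward
print(signal.tolist())                 # [-1.0, nan, 1.0, nan, -1.0]
print(continuous.tolist())             # [-1.0, -1.0, 1.0, 1.0, -1.0]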
0
Example 82
Project: simpeg Source File: ediFilesUtils.py
def importFiles(self):
"""
Function to import EDI files into a object.
"""
# Constants that are needed for conversion of units
# Temp lists
tmpStaList = []
tmpCompList = ['freq','x','y','z']
tmpCompList.extend(self.comps)
# Make the outarray
dtRI = [(compS.lower().replace('.',''),float) for compS in tmpCompList]
# Loop through all the files
for nrEDI, EDIfile in enumerate(self.filesList):
# Read the file into a list of the lines
with open(EDIfile,'r') as fid:
EDIlines = fid.readlines()
# Find the location
latD, longD, elevM = _findLatLong(EDIlines)
# Transform coordinates
transCoord = self._transfromPoints(longD,latD)
# Extract the name of the file (station)
EDIname = EDIfile.split(os.sep)[-1].split('.')[0]
# Arrange the data
staList = [EDIname, EDIfile, transCoord[0], transCoord[1], elevM[0]]
# Add to the station list
tmpStaList.extend(staList)
# Read the frequency data
freq = _findEDIcomp('>FREQ',EDIlines)
# Make the temporary rec array.
tArrRec = ( np.nan*np.ones( (len(freq),len(dtRI)) ) ).view(dtRI) #np.concatenate((freq*np.ones((locs.shape[0],1)),locs,np.nan*np.ones((locs.shape[0],8))),axis=1).view(dtRI)
# Add data to the array
tArrRec['freq'] = mkvc(freq,2)
tArrRec['x'] = mkvc(np.ones((len(freq),1))*transCoord[0],2)
tArrRec['y'] = mkvc(np.ones((len(freq),1))*transCoord[1],2)
tArrRec['z'] = mkvc(np.ones((len(freq),1))*elevM[0],2)
for comp in self.comps:
# Deal with converting units of the impedance tensor
if 'Z' in comp:
unitConvert = self._impUnitEDI2SI
else:
unitConvert = 1
# Rotate the data since EDI x is *north, y *east but Simpeg uses x *east, y *north (* means internal reference frame)
key = [comp.lower().replace('.','').replace(s,t) for s,t in [['xx','yy'],['xy','yx'],['yx','xy'],['yy','xx']] if s in comp.lower()][0]
tArrRec[key] = mkvc(unitConvert*_findEDIcomp('>'+comp,EDIlines),2)
# Make a masked array
mArrRec = np.ma.MaskedArray(rec_to_ndarr(tArrRec),mask=np.isnan(rec_to_ndarr(tArrRec))).view(dtype=tArrRec.dtype)
try:
outTemp = recFunc.stack_arrays((outTemp,mArrRec))
except NameError:
outTemp = mArrRec
# Assign the data
self._data = outTemp
0
Example 83
def handle_data(self, data):
self.history_return_price_class.append(
self.return_price_class.handle_data(data))
self.history_return_price_decorator.append(
self.return_price_decorator.handle_data(data))
self.history_return_args.append(
self.return_args_batch.handle_data(
data, *self.args, **self.kwargs))
self.history_return_not_full.append(
self.return_not_full.handle_data(data))
self.uses_ufunc.handle_data(data)
# check that calling transforms with the same arguments
# is idempotent
self.price_multiple.handle_data(data, 1, extra_arg=1)
if self.price_multiple.full:
pre = self.price_multiple.rolling_panel.get_current().shape[0]
result1 = self.price_multiple.handle_data(data, 1, extra_arg=1)
post = self.price_multiple.rolling_panel.get_current().shape[0]
assert pre == post, "batch transform is appending redundant events"
result2 = self.price_multiple.handle_data(data, 1, extra_arg=1)
assert result1 is result2, "batch transform is not idempotent"
# check that calling transform with the same data, but
# different supplemental arguments results in new
# results.
result3 = self.price_multiple.handle_data(data, 2, extra_arg=1)
assert result1 is not result3, \
"batch transform is not updating for new args"
result4 = self.price_multiple.handle_data(data, 1, extra_arg=2)
assert result1 is not result4,\
"batch transform is not updating for new kwargs"
new_data = deepcopy(data)
for sid in new_data:
new_data[sid]['arbitrary'] = 123
self.history_return_arbitrary_fields.append(
self.return_arbitrary_fields.handle_data(new_data))
# nan every second event price
if self.iter % 2 == 0:
self.history_return_nan.append(
self.return_nan.handle_data(data))
else:
nan_data = deepcopy(data)
for sid in nan_data.iterkeys():
nan_data[sid].price = np.nan
self.history_return_nan.append(
self.return_nan.handle_data(nan_data))
self.iter += 1
# Add a new sid to check that it does not get included
extra_sid_data = deepcopy(data)
extra_sid_data[1] = extra_sid_data[0]
self.history_return_sid_filter.append(
self.return_sid_filter.handle_data(extra_sid_data)
)
# Add a field to check that it does not get included
extra_field_data = deepcopy(data)
extra_field_data[0]['ignore'] = extra_sid_data[0]['price']
self.history_return_field_filter.append(
self.return_field_filter.handle_data(extra_field_data)
)
self.history_return_field_no_filter.append(
self.return_field_no_filter.handle_data(extra_field_data)
)
0
Example 84
def sortino_ratio(returns, required_return=0, period=DAILY,
annualization=None, _downside_risk=None):
"""
Determines the Sortino ratio of a strategy.
Parameters
----------
returns : pd.Series or np.ndarray or pd.DataFrame
Daily returns of the strategy, noncumulative.
- See full explanation in :func:`~empyrical.stats.cum_returns`.
required_return: float / series
minimum acceptable return
period : str, optional
Defines the periodicity of the 'returns' data for purposes of
annualizing. Value ignored if `annualization` parameter is specified.
Defaults are:
'monthly':12
'weekly': 52
'daily': 252
annualization : int, optional
Used to suppress default values available in `period` to convert
returns into annual returns. Value should be the annual frequency of
`returns`.
_downside_risk : float, optional
The downside risk of the given inputs, if known. Will be calculated if
not provided.
Returns
-------
float, pd.Series
depends on input type
series ==> float
DataFrame ==> pd.Series
Annualized Sortino ratio.
"""
if len(returns) < 2:
return np.nan
ann_factor = annualization_factor(period, annualization)
adj_returns = _adjust_returns(returns, required_return)
mu = nanmean(adj_returns, axis=0)
dsr = (_downside_risk if _downside_risk is not None
else downside_risk(returns, required_return))
sortino = mu / dsr
return sortino * ann_factor
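The early return at the top is one of several places where empyrical hands back np.nan instead of raising on degenerate input. A hedged usage sketch, assuming the empyrical package is installed:

import numpy as np
import pandas as pd
import empyrical

too_short = pd.Series([0.01])
assert np.isnan(empyrical.sortino_ratio(too_short))  # fewer than two returns

returns = pd.Series([0.01, -0.02, 0.015, 0.005, -0.01])
print(empyrical.sortino_ratio(returns))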
0
Example 85
def extract(self, inputfile, line):
"""Extract information from the file object inputfile."""
# extract the version number first
if "Program Version" in line:
self.metadata["package_version"] = line.split()[2]
if line[0:15] == "Number of atoms":
natom = int(line.split()[-1])
self.set_attribute('natom', natom)
if line[1:13] == "Total Charge":
charge = int(line.split()[-1])
self.set_attribute('charge', charge)
line = next(inputfile)
mult = int(line.split()[-1])
self.set_attribute('mult', mult)
# SCF convergence output begins with:
#
# --------------
# SCF ITERATIONS
# --------------
#
# However, there are two common formats which need to be handled, implemented as separate functions.
if "SCF ITERATIONS" in line:
self.skip_line(inputfile, 'dashes')
line = next(inputfile)
columns = line.split()
# "Starting incremental Fock matrix formation" doesn't
# necessarily appear before the extended format.
if not columns:
self.parse_scf_expanded_format(inputfile, columns)
# A header with distinct columns indicates the condensed
# format.
elif columns[1] == "Energy":
self.parse_scf_condensed_format(inputfile, columns)
# Assume the extended format.
else:
self.parse_scf_expanded_format(inputfile, columns)
# Information about the final iteration, which also includes the convergence
# targets and the convergence values, is printed separately, in a section like this:
#
# *****************************************************
# * SUCCESS *
# * SCF CONVERGED AFTER 9 CYCLES *
# *****************************************************
#
# ...
#
# Total Energy : -382.04963064 Eh -10396.09898 eV
#
# ...
#
# ------------------------- ----------------
# FINAL SINGLE POINT ENERGY -382.049630637
# ------------------------- ----------------
#
# We cannot use this last message as a stop condition in general, because
# often there is vibrational output before it. So we use the 'Total Energy'
# line. However, what comes after that is different for single point calculations
# and in the inner steps of geometry optimizations.
if "SCF CONVERGED AFTER" in line:
if not hasattr(self, "scfenergies"):
self.scfenergies = []
if not hasattr(self, "scfvalues"):
self.scfvalues = []
if not hasattr(self, "scftargets"):
self.scftargets = []
while not "Total Energy :" in line:
line = next(inputfile)
energy = utils.convertor(float(line.split()[3]), "hartree", "eV")
self.scfenergies.append(energy)
self._append_scfvalues_scftargets(inputfile, line)
# Sometimes the SCF does not converge, but does not halt
# the run (like in bug 3184890). In this case, we should
# remain consistent and use the energy from the last reported
# SCF cycle. In such cases, ORCA prints a banner like this:
#
# *****************************************************
# * ERROR *
# * SCF NOT CONVERGED AFTER 8 CYCLES *
# *****************************************************
if "SCF NOT CONVERGED AFTER" in line:
if not hasattr(self, "scfenergies"):
self.scfenergies = []
if not hasattr(self, "scfvalues"):
self.scfvalues = []
if not hasattr(self, "scftargets"):
self.scftargets = []
energy = utils.convertor(self.scfvalues[-1][-1][0], "hartree", "eV")
self.scfenergies.append(energy)
self._append_scfvalues_scftargets(inputfile, line)
# The convergence targets for geometry optimizations are printed at the
# beginning of the output, although the order and their description is
# different than later on. So, try to standardize the names of the criteria
# and save them for later so that we can get the order right.
#
# *****************************
# * Geometry Optimization Run *
# *****************************
#
# Geometry optimization settings:
# Update method Update .... BFGS
# Choice of coordinates CoordSys .... Redundant Internals
# Initial Hessian InHess .... Almoef's Model
#
# Convergence Tolerances:
# Energy Change TolE .... 5.0000e-06 Eh
# Max. Gradient TolMAXG .... 3.0000e-04 Eh/bohr
# RMS Gradient TolRMSG .... 1.0000e-04 Eh/bohr
# Max. Displacement TolMAXD .... 4.0000e-03 bohr
# RMS Displacement TolRMSD .... 2.0000e-03 bohr
#
if line[25:50] == "Geometry Optimization Run":
stars = next(inputfile)
blank = next(inputfile)
line = next(inputfile)
while line[0:23] != "Convergence Tolerances:":
line = next(inputfile)
if hasattr(self, 'geotargets'):
self.logger.warning('The geotargets attribute should not exist yet. There is a problem in the parser.')
self.geotargets = []
self.geotargets_names = []
# There should always be five tolerance values printed here.
for i in range(5):
line = next(inputfile)
name = line[:25].strip().lower().replace('.', '').replace('displacement', 'step')
target = float(line.split()[-2])
self.geotargets_names.append(name)
self.geotargets.append(target)
# The convergence targets for relaxed surface scan steps are printed at the
# beginning of the output, although the order and their description is
# different than later on. So, try to standardize the names of the criteria
# and save them for later so that we can get the order right.
#
# *************************************************************
# * RELAXED SURFACE SCAN STEP 12 *
# * *
# * Dihedral ( 11, 10, 3, 4) : 180.00000000 *
# *************************************************************
#
# Geometry optimization settings:
# Update method Update .... BFGS
# Choice of coordinates CoordSys .... Redundant Internals
# Initial Hessian InHess .... Almoef's Model
#
# Convergence Tolerances:
# Energy Change TolE .... 5.0000e-06 Eh
# Max. Gradient TolMAXG .... 3.0000e-04 Eh/bohr
# RMS Gradient TolRMSG .... 1.0000e-04 Eh/bohr
# Max. Displacement TolMAXD .... 4.0000e-03 bohr
# RMS Displacement TolRMSD .... 2.0000e-03 bohr
if line[25:50] == "RELAXED SURFACE SCAN STEP":
self.is_relaxed_scan = True
blank = next(inputfile)
info = next(inputfile)
stars = next(inputfile)
blank = next(inputfile)
line = next(inputfile)
while line[0:23] != "Convergence Tolerances:":
line = next(inputfile)
self.geotargets = []
self.geotargets_names = []
# There should always be five tolerance values printed here.
for i in range(5):
line = next(inputfile)
name = line[:25].strip().lower().replace('.', '').replace('displacement', 'step')
target = float(line.split()[-2])
self.geotargets_names.append(name)
self.geotargets.append(target)
# After each geometry optimization step, ORCA prints the current convergence
# parameters and the targets (again), so it is a good idea to check that they
# have not changed. Note that the order of these criteria here is different
# than at the beginning of the output, so make use of the geotargets_names created
# before and save the new geovalues in correct order.
#
# ----------------------|Geometry convergence|---------------------
# Item value Tolerance Converged
# -----------------------------------------------------------------
# Energy change 0.00006021 0.00000500 NO
# RMS gradient 0.00031313 0.00010000 NO
# RMS step 0.01596159 0.00200000 NO
# MAX step 0.04324586 0.00400000 NO
# ....................................................
# Max(Bonds) 0.0218 Max(Angles) 2.48
# Max(Dihed) 0.00 Max(Improp) 0.00
# -----------------------------------------------------------------
#
if line[33:53] == "Geometry convergence":
if not hasattr(self, "geovalues"):
self.geovalues = []
headers = next(inputfile)
dashes = next(inputfile)
names = []
values = []
targets = []
line = next(inputfile)
while list(set(line.strip())) != ["."]:
name = line[10:28].strip().lower()
value = float(line.split()[2])
target = float(line.split()[3])
names.append(name)
values.append(value)
targets.append(target)
line = next(inputfile)
# The energy change is normally not printed in the first iteration, because
# there was no previous energy -- in that case assume zero. There are also some
# edge cases where the energy change is not printed, for example when internal
# angles become improper and internal coordinates are rebuilt as in regression
# CuI-MePY2-CH3CN_optxes, and in such cases use NaN.
newvalues = []
for i, n in enumerate(self.geotargets_names):
if (n == "energy change") and (n not in names):
if self.is_relaxed_scan:
newvalues.append(0.0)
else:
newvalues.append(numpy.nan)
else:
newvalues.append(values[names.index(n)])
assert targets[names.index(n)] == self.geotargets[i]
self.geovalues.append(newvalues)
#if not an optimization, determine structure used
if line[0:21] == "CARTESIAN COORDINATES" and not hasattr(self, "atomcoords"):
self.skip_line(inputfile, 'dashes')
atomnos = []
atomcoords = []
line = next(inputfile)
while len(line) > 1:
broken = line.split()
atomnos.append(self.table.number[broken[0]])
atomcoords.append(list(map(float, broken[1:4])))
line = next(inputfile)
self.set_attribute('natom', len(atomnos))
self.set_attribute('atomnos', atomnos)
self.atomcoords = [atomcoords]
# There's always a banner announcing the next geometry optimization cycle,
# which looks something like this:
#
# *************************************************************
# * GEOMETRY OPTIMIZATION CYCLE 2 *
# *************************************************************
if "GEOMETRY OPTIMIZATION CYCLE" in line:
# Keep track of the current cycle just in case, because some things
# are printed differently inside the first/last and other cycles.
self.gopt_cycle = int(line.split()[4])
self.skip_lines(inputfile, ['s', 'd', 'text', 'd'])
if not hasattr(self, "atomcoords"):
self.atomcoords = []
atomnos = []
atomcoords = []
for i in range(self.natom):
line = next(inputfile)
broken = line.split()
atomnos.append(self.table.number[broken[0]])
atomcoords.append(list(map(float, broken[1:4])))
self.atomcoords.append(atomcoords)
self.set_attribute('atomnos', atomnos)
if line[21:68] == "FINAL ENERGY EVALUATION AT THE STATIONARY POINT":
if not hasattr(self, 'optdone'):
self.optdone = []
self.optdone.append(len(self.atomcoords))
self.skip_lines(inputfile, ['text', 's', 'd', 'text', 'd'])
atomcoords = []
for i in range(self.natom):
line = next(inputfile)
broken = line.split()
atomcoords.append(list(map(float, broken[1:4])))
self.atomcoords.append(atomcoords)
if "The optimization did not converge" in line:
if not hasattr(self, 'optdone'):
self.optdone = []
if line[0:16] == "ORBITAL ENERGIES":
self.skip_lines(inputfile, ['d', 'text', 'text'])
self.mooccnos = [[]]
self.moenergies = [[]]
line = next(inputfile)
while len(line) > 20: # restricted calcs are terminated by ------
info = line.split()
mooccno = int(float(info[1]))
moenergy = float(info[2])
self.mooccnos[0].append(mooccno)
self.moenergies[0].append(utils.convertor(moenergy, "hartree", "eV"))
line = next(inputfile)
line = next(inputfile)
# handle beta orbitals for UHF
if line[17:35] == "SPIN DOWN ORBITALS":
text = next(inputfile)
self.mooccnos.append([])
self.moenergies.append([])
line = next(inputfile)
while len(line) > 20: # actually terminated by ------
info = line.split()
mooccno = int(float(info[1]))
moenergy = float(info[2])
self.mooccnos[1].append(mooccno)
self.moenergies[1].append(utils.convertor(moenergy, "hartree", "eV"))
line = next(inputfile)
if not hasattr(self, 'humos'):
doubly_occupied = self.mooccnos[0].count(2)
singly_occupied = self.mooccnos[0].count(1)
# Restricted closed-shell.
if doubly_occupied > 0 and singly_occupied == 0:
self.set_attribute('humos', [doubly_occupied - 1])
# Restricted open-shell.
elif doubly_occupied > 0 and singly_occupied > 0:
self.set_attribute('humos', [doubly_occupied + singly_occupied - 1,
doubly_occupied - 1])
# Unrestricted.
else:
assert len(self.moenergies) == 2
assert doubly_occupied == 0
assert self.mooccnos[1].count(2) == 0
nbeta = self.mooccnos[1].count(1)
self.set_attribute('humos', [singly_occupied - 1, nbeta - 1])
# So nbasis was parsed at first with the first pattern, but it turns out that
# semiempirical methods (at least AM1 as reported by Julien Idé) do not use this.
# For this reason, also check for the second pattern, and use it as an assert
# if nbasis was already parsed. Regression PCB_1_122.out covers this test case.
if line[1:32] == "# of contracted basis functions":
self.set_attribute('nbasis', int(line.split()[-1]))
if line[1:27] == "Basis Dimension Dim":
self.set_attribute('nbasis', int(line.split()[-1]))
if line[0:14] == "OVERLAP MATRIX":
self.skip_line(inputfile, 'dashes')
self.aooverlaps = numpy.zeros((self.nbasis, self.nbasis), "d")
for i in range(0, self.nbasis, 6):
self.updateprogress(inputfile, "Overlap")
header = next(inputfile)
size = len(header.split())
for j in range(self.nbasis):
line = next(inputfile)
broken = line.split()
self.aooverlaps[j, i:i+size] = list(map(float, broken[1:size+1]))
# Molecular orbital coefficients.
# This is also where atombasis is parsed.
if line[0:18] == "MOLECULAR ORBITALS":
self.skip_line(inputfile, 'dashes')
mocoeffs = [numpy.zeros((self.nbasis, self.nbasis), "d")]
self.aonames = []
self.atombasis = []
for n in range(self.natom):
self.atombasis.append([])
for spin in range(len(self.moenergies)):
if spin == 1:
self.skip_line(inputfile, 'blank')
mocoeffs.append(numpy.zeros((self.nbasis, self.nbasis), "d"))
for i in range(0, self.nbasis, 6):
self.updateprogress(inputfile, "Coefficients")
self.skip_lines(inputfile, ['numbers', 'energies', 'occs'])
dashes = next(inputfile)
broken = dashes.split()
size = len(broken)
for j in range(self.nbasis):
line = next(inputfile)
broken = line.split()
#only need this on the first time through
if spin == 0 and i == 0:
atomname = line[3:5].split()[0]
num = int(line[0:3])
orbital = broken[1].upper()
self.aonames.append("%s%i_%s" % (atomname, num+1, orbital))
self.atombasis[num].append(j)
temp = []
vals = line[16:-1] # -1 to remove the last blank space
for k in range(0, len(vals), 10):
temp.append(float(vals[k:k+10]))
mocoeffs[spin][i:i+size, j] = temp
self.mocoeffs = mocoeffs
# Basis set information
# ORCA prints this out in a somewhat indirect fashion.
# Therefore, parsing occurs in several steps:
# 1. read which atom belongs to which basis set group
if line[0:21] == "BASIS SET INFORMATION":
line = next(inputfile)
line = next(inputfile)
self.tmp_atnames = [] # temporary attribute, needed later
while(not line[0:5] == '-----'):
if line[0:4] == "Atom":
self.tmp_atnames.append(line[8:12].strip())
line = next(inputfile)
# 2. Read information for the basis set groups
if line[0:25] == "BASIS SET IN INPUT FORMAT":
line = next(inputfile)
line = next(inputfile)
# loop over basis set groups
gbasis_tmp = {}
while(not line[0:5] == '-----'):
if line[1:7] == 'NewGTO':
bas_atname = line.split()[1]
gbasis_tmp[bas_atname] = []
line = next(inputfile)
# loop over contracted GTOs
while(not line[0:6] == ' end;'):
words = line.split()
ang = words[0]
nprim = int(words[1])
# loop over primitives
coeff = []
for iprim in range(nprim):
words = next(inputfile).split()
coeff.append( (float(words[1]), float(words[2])) )
gbasis_tmp[bas_atname].append((ang, coeff))
line = next(inputfile)
line = next(inputfile)
# 3. Assign the basis sets to gbasis
self.gbasis = []
for bas_atname in self.tmp_atnames:
self.gbasis.append(gbasis_tmp[bas_atname])
del self.tmp_atnames
# Read TDDFT information
if line[0:18] == "TD-DFT/TDA EXCITED":
# Could be singlets or triplets
if line.find("SINGLETS") >= 0:
sym = "Singlet"
elif line.find("TRIPLETS") >= 0:
sym = "Triplet"
else:
sym = "Not specified"
if not hasattr(self, "etenergies"):
self.etsecs = []
self.etenergies = []
self.etsyms = []
lookup = {'a': 0, 'b': 1}
line = next(inputfile)
while line.find("STATE") < 0:
line = next(inputfile)
# Contains STATE or is blank
while line.find("STATE") >= 0:
broken = line.split()
self.etenergies.append(float(broken[-2]))
self.etsyms.append(sym)
line = next(inputfile)
sec = []
# Contains SEC or is blank
while line.strip():
start = line[0:8].strip()
start = (int(start[:-1]), lookup[start[-1]])
end = line[10:17].strip()
end = (int(end[:-1]), lookup[end[-1]])
contrib = float(line[35:47].strip())
sec.append([start, end, contrib])
line = next(inputfile)
self.etsecs.append(sec)
line = next(inputfile)
# This will parse etoscs for TD calculations, but note that ORCA generally
# prints two sets, one based on the length form of transition dipole moments,
# the other based on the velocity form. Although these should be identical
# in the basis set limit, in practice they are rarely the same. Here we will
# effectively parse just the spectrum based on the length-form.
if (line[25:44] == "ABSORPTION SPECTRUM" or line[9:28] == "ABSORPTION SPECTRUM") and not hasattr(self, "etoscs"):
self.skip_lines(inputfile, ['d', 'header', 'header', 'd'])
self.etoscs = []
for x in self.etsyms:
osc = next(inputfile).split()[3]
if osc == "spin": # "spin forbidden"
osc = 0
else:
osc = float(osc)
self.etoscs.append(osc)
if line[0:23] == "VIBRATIONAL FREQUENCIES":
self.skip_lines(inputfile, ['d', 'b'])
self.vibfreqs = numpy.zeros((3 * self.natom,), "d")
for i in range(3 * self.natom):
line = next(inputfile)
self.vibfreqs[i] = float(line.split()[1])
if numpy.any(self.vibfreqs[0:6] != 0):
msg = "Modes corresponding to rotations/translations "
msg += "may be non-zero."
self.logger.warning(msg)
self.vibfreqs = self.vibfreqs[6:]
if line[0:12] == "NORMAL MODES":
""" Format:
NORMAL MODES
------------
These modes are the cartesian displacements weighted by the diagonal matrix
M(i,i)=1/sqrt(m[i]) where m[i] is the mass of the displaced atom
Thus, these vectors are normalized but *not* orthogonal
0 1 2 3 4 5
0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
1 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
2 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
...
"""
self.vibdisps = numpy.zeros((3 * self.natom, self.natom, 3), "d")
self.skip_lines(inputfile, ['d', 'b', 'text', 'text', 'text', 'b'])
for mode in range(0, 3 * self.natom, 6):
header = next(inputfile)
for atom in range(self.natom):
x = next(inputfile).split()[1:]
y = next(inputfile).split()[1:]
z = next(inputfile).split()[1:]
self.vibdisps[mode:mode + 6, atom, 0] = x
self.vibdisps[mode:mode + 6, atom, 1] = y
self.vibdisps[mode:mode + 6, atom, 2] = z
self.vibdisps = self.vibdisps[6:]
if line[0:11] == "IR SPECTRUM":
self.skip_lines(inputfile, ['d', 'b', 'header', 'd'])
self.vibirs = numpy.zeros((3 * self.natom,), "d")
line = next(inputfile)
while len(line) > 2:
num = int(line[0:4])
self.vibirs[num] = float(line.split()[2])
line = next(inputfile)
self.vibirs = self.vibirs[6:]
if line[0:14] == "RAMAN SPECTRUM":
self.skip_lines(inputfile, ['d', 'b', 'header', 'd'])
self.vibramans = numpy.zeros((3 * self.natom,), "d")
line = next(inputfile)
while len(line) > 2:
num = int(line[0:4])
self.vibramans[num] = float(line.split()[2])
line = next(inputfile)
self.vibramans = self.vibramans[6:]
# ORCA will print atomic charges along with the spin populations,
# so care must be taken about choosing the proper column.
# Population analyses are performed usually only at the end
# of a geometry optimization or other run, so we want to
# leave just the final atom charges.
# Here is an example for Mulliken charges:
# --------------------------------------------
# MULLIKEN ATOMIC CHARGES AND SPIN POPULATIONS
# --------------------------------------------
# 0 H : 0.126447 0.002622
# 1 C : -0.613018 -0.029484
# 2 H : 0.189146 0.015452
# 3 H : 0.320041 0.037434
# ...
# Sum of atomic charges : -0.0000000
# Sum of atomic spin populations: 1.0000000
if line[:23] == "MULLIKEN ATOMIC CHARGES":
has_spins = "AND SPIN POPULATIONS" in line
if not hasattr(self, "atomcharges"):
self.atomcharges = {}
if has_spins and not hasattr(self, "atomspins"):
self.atomspins = {}
self.skip_line(inputfile, 'dashes')
charges = []
if has_spins:
spins = []
line = next(inputfile)
while line[:21] != "Sum of atomic charges":
charges.append(float(line[8:20]))
if has_spins:
spins.append(float(line[20:]))
line = next(inputfile)
self.atomcharges["mulliken"] = charges
if has_spins:
self.atomspins["mulliken"] = spins
# Things are the same for Lowdin populations, except that the sums
# are not printed (there is a blank line at the end).
if line[:22] == "LOEWDIN ATOMIC CHARGES":
has_spins = "AND SPIN POPULATIONS" in line
if not hasattr(self, "atomcharges"):
self.atomcharges = {}
if has_spins and not hasattr(self, "atomspins"):
self.atomspins = {}
self.skip_line(inputfile, 'dashes')
charges = []
if has_spins:
spins = []
line = next(inputfile)
while line.strip():
charges.append(float(line[8:20]))
if has_spins:
spins.append(float(line[20:]))
line = next(inputfile)
self.atomcharges["lowdin"] = charges
if has_spins:
self.atomspins["lowdin"] = spins
# It is not stated explicitly, but the dipole moment components printed by ORCA
# seem to be in atomic units, so they will need to be converted. Also, they
# are most probably calculated with respect to the origin.
#
# -------------
# DIPOLE MOMENT
# -------------
# X Y Z
# Electronic contribution: 0.00000 -0.00000 -0.00000
# Nuclear contribution : 0.00000 0.00000 0.00000
# -----------------------------------------
# Total Dipole Moment : 0.00000 -0.00000 -0.00000
# -----------------------------------------
# Magnitude (a.u.) : 0.00000
# Magnitude (Debye) : 0.00000
#
if line.strip() == "DIPOLE MOMENT":
self.skip_lines(inputfile, ['d', 'XYZ', 'electronic', 'nuclear', 'd'])
total = next(inputfile)
assert "Total Dipole Moment" in total
reference = [0.0, 0.0, 0.0]
dipole = numpy.array([float(d) for d in total.split()[-3:]])
dipole = utils.convertor(dipole, "ebohr", "Debye")
if not hasattr(self, 'moments'):
self.moments = [reference, dipole]
else:
try:
assert numpy.all(self.moments[1] == dipole)
except AssertionError:
self.logger.warning('Overwriting previous multipole moments with new values')
self.moments = [reference, dipole]
# Static polarizability.
if line.strip() == "THE POLARIZABILITY TENSOR":
if not hasattr(self, 'polarizabilities'):
self.polarizabilities = []
self.skip_lines(inputfile, ['d', 'b'])
line = next(inputfile)
assert line.strip() == "The raw cartesian tensor (atomic units):"
polarizability = []
for _ in range(3):
line = next(inputfile)
polarizability.append(line.split())
self.polarizabilities.append(numpy.array(polarizability))
0
Example 86
def test_mean(self):
"""
Test the ceil division
"""
self.assertAllClose(misc.mean(3),
3)
with warnings.catch_warnings():
warnings.simplefilter("ignore", RuntimeWarning)
self.assertAllClose(misc.mean(np.nan),
np.nan)
self.assertAllClose(misc.mean([[2,3],
[np.nan,np.nan]],
axis=-1),
[2.5,np.nan])
self.assertAllClose(misc.mean([[2,3],
[np.nan,np.nan]],
axis=-1,
keepdims=True),
[[2.5],[np.nan]])
self.assertAllClose(misc.mean([[2,3],
[np.nan,np.nan]],
axis=-2),
[2,3])
self.assertAllClose(misc.mean([[2,3],
[np.nan,np.nan]],
axis=-2,
keepdims=True),
[[2,3]])
self.assertAllClose(misc.mean([[2,3],
[np.nan,np.nan]]),
2.5)
self.assertAllClose(misc.mean([[2,3],
[np.nan,np.nan]],
axis=(-1,-2)),
2.5)
self.assertAllClose(misc.mean([[2,3],
[np.nan,np.nan]],
keepdims=True),
[[2.5]])
pass
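The expectations above line up with NumPy's nan-aware mean: NaNs are skipped, and an all-NaN slice comes back as NaN together with a RuntimeWarning. A sketch using np.nanmean directly, which the tested misc.mean appears to mirror:

import warnings
import numpy as np

data = np.array([[2.0, 3.0], [np.nan, np.nan]])

with warnings.catch_warnings():
    warnings.simplefilter("ignore", RuntimeWarning)
    print(np.nanmean(data, axis=-1))  # [2.5 nan] -- the all-NaN row stays NaN
    print(np.nanmean(data, axis=-2))  # [2. 3.]   -- NaNs skipped column-wise
    print(np.nanmean(data))           # 2.5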
0
Example 87
def __init__(self, t=None, m=None, e=None, target=None, meta_features={},
name=None, path=None, channel_names=None):
"""Create a `TimeSeries` object from measurement values/metadata.
See `TimeSeries` documentation for parameter values.
"""
if t is None and m is None:
raise ValueError("Either times or measurements must be provided.")
elif m is None:
m = _default_values_like(t, value=np.nan)
# If m is 1-dimensional, so are t and e
if _ndim(m) == 1:
self.n_channels = 1
if t is None:
t = _default_values_like(m, upper=DEFAULT_MAX_TIME)
if e is None:
e = _default_values_like(m, value=DEFAULT_ERROR_VALUE)
# If m is 2-dimensional, t and e could be 1d or 2d; default is 1d
elif isinstance(m, np.ndarray) and m.ndim == 2:
self.n_channels = len(m)
if t is None:
t = _default_values_like(m[0], upper=DEFAULT_MAX_TIME)
if e is None:
e = _default_values_like(m[0], value=DEFAULT_ERROR_VALUE)
# If m is ragged (list of 1d arrays), t and e should also be ragged
elif _ndim(m) == 2:
self.n_channels = len(m)
if t is None:
t = _default_values_like(m, upper=DEFAULT_MAX_TIME)
if e is None:
e = _default_values_like(m, value=DEFAULT_ERROR_VALUE)
else:
raise ValueError("m must be a 1D or 2D array, or a 2D list of"
" arrays.")
self.time = _make_array_if_possible(t)
self.measurement = _make_array_if_possible(m)
self.error = _make_array_if_possible(e)
if _ndim(self.time) == 1 and _ndim(self.measurement) == 2:
if isinstance(self.measurement, np.ndarray):
self.time = np.broadcast_to(self.time, self.measurement.shape)
else:
raise ValueError("Times for each channel must be provided if m"
" is a ragged array.")
if _ndim(self.error) == 1 and _ndim(self.measurement) == 2:
if isinstance(self.measurement, np.ndarray):
self.error = np.broadcast_to(self.error, self.measurement.shape)
else:
raise ValueError("Errors for each channel must be provided if m"
" is a ragged array.")
if not (_compatible_shapes(self.measurement, self.time) and
_compatible_shapes(self.measurement, self.error)):
raise ValueError("times, values, errors are not of compatible"
" types/sizes. Please refer to the docstring"
" for list of allowed input types.")
self.target = target
self.meta_features = dict(meta_features)
self.name = name
self.path = path
if channel_names is None:
self.channel_names = ["channel_{}".format(i)
for i in range(self.n_channels)]
else:
self.channel_names = channel_names
0
Example 88
Project: pysystemtrade Source File: correlations.py
def __init__(self, data, log=logtoscreen("optimiser"), frequency="W", date_method="expanding",
rollyears=20,
dict_group=dict(), boring_offdiag=0.99, cleaning=True, **kwargs):
"""
We generate a correlation from either a pd.DataFrame, or a list of them if we're pooling
It's important that forward filling, or index / ffill / diff, has been done before we begin
:param data: Data to get correlations from
:type data: pd.DataFrame or list if pooling
:param frequency: Downsampling frequency. Must be "D", "W" or bigger
:type frequency: str
:param date_method: Method to pass to generate_fitting_dates
:type date_method: str
:param roll_years: If date_method is "rolling", number of years in window
:type roll_years: int
:param dict_group: dictionary of groupings; used to replace missing values
:type dict_group: dict
:param boring_offdiag: Value used in creating 'boring' matrix, for when no data
:type boring_offdiag: float
:param **kwargs: passed to correlation_single_period
:returns: CorrelationList
"""
cleaning = str2Bool(cleaning)
# grouping dictionary, convert to a faster, algo-friendly form
group_dict = group_dict_from_natural(dict_group)
data = df_from_list(data)
column_names = list(data.columns)
data = data.resample(frequency, how="last")
# Generate time periods
fit_dates = generate_fitting_dates(
data, date_method=date_method, rollyears=rollyears)
size = len(column_names)
corr_with_no_data = boring_corr_matrix(size, offdiag=boring_offdiag)
# create a list of correlation matrices
corr_list = []
log.terse("Correlation estimate")
# Now for each time period, estimate correlation
for fit_period in fit_dates:
log.msg("Estimating from %s to %s" %
(fit_period.period_start, fit_period.period_end))
if fit_period.no_data:
# no data to fit with
corr_with_nan = boring_corr_matrix(
size, offdiag=np.nan, diag=np.nan)
corrmat = corr_with_nan
else:
data_for_estimate = data[
fit_period.fit_start:fit_period.fit_end]
corrmat = correlation_single_period(data_for_estimate,
**kwargs)
if cleaning:
current_period_data = data[
fit_period.fit_start:fit_period.fit_end]
must_haves = must_have_item(current_period_data)
# means we can use earlier correlations with sensible values
corrmat = clean_correlation(
corrmat, corr_with_no_data, must_haves)
corr_list.append(corrmat)
setattr(self, "corr_list", corr_list)
setattr(self, "columns", column_names)
setattr(self, "fit_dates", fit_dates)
0
Example 89
def deviation(reference_intervals, estimated_intervals, trim=False):
"""Compute the median deviations between reference
and estimated boundary times.
Examples
--------
>>> ref_intervals, _ = mir_eval.io.load_labeled_intervals('ref.lab')
>>> est_intervals, _ = mir_eval.io.load_labeled_intervals('est.lab')
>>> r_to_e, e_to_r = mir_eval.boundary.deviation(ref_intervals,
... est_intervals)
Parameters
----------
reference_intervals : np.ndarray, shape=(n, 2)
reference segment intervals, in the format returned by
:func:`mir_eval.io.load_intervals` or
:func:`mir_eval.io.load_labeled_intervals`.
estimated_intervals : np.ndarray, shape=(m, 2)
estimated segment intervals, in the format returned by
:func:`mir_eval.io.load_intervals` or
:func:`mir_eval.io.load_labeled_intervals`.
trim : boolean
if ``True``, the first and last intervals are ignored.
Typically, these denote start (0.0) and end-of-track markers.
(Default value = False)
Returns
-------
reference_to_estimated : float
median time from each reference boundary to the
closest estimated boundary
estimated_to_reference : float
median time from each estimated boundary to the
closest reference boundary
"""
validate_boundary(reference_intervals, estimated_intervals, trim)
# Convert intervals to boundaries
reference_boundaries = util.intervals_to_boundaries(reference_intervals)
estimated_boundaries = util.intervals_to_boundaries(estimated_intervals)
# Suppress the first and last intervals
if trim:
reference_boundaries = reference_boundaries[1:-1]
estimated_boundaries = estimated_boundaries[1:-1]
# If we have no boundaries, we get no score.
if len(reference_boundaries) == 0 or len(estimated_boundaries) == 0:
return np.nan, np.nan
dist = np.abs(np.subtract.outer(reference_boundaries,
estimated_boundaries))
estimated_to_reference = np.median(dist.min(axis=0))
reference_to_estimated = np.median(dist.min(axis=1))
return reference_to_estimated, estimated_to_reference
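As documented above, deviation returns a pair of NaNs rather than raising when trimming leaves no boundaries to compare. A hedged usage sketch, assuming mir_eval is installed (it may also warn that the trimmed interval lists are effectively empty):

import numpy as np
import mir_eval

ref = np.array([[0.0, 10.0]])  # a single interval: only the start/end markers
est = np.array([[0.0, 10.0]])

# trim=True drops the first and last boundaries, leaving nothing to score.
r_to_e, e_to_r = mir_eval.boundary.deviation(ref, est, trim=True)
assert np.isnan(r_to_e) and np.isnan(e_to_r)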
0
Example 90
Project: statsmodels Source File: results_discrete.py
def mnlogit(self):
"""
Results generated with
anes_data = sm.datasets.anes96.load()
anes_exog = anes_data.exog
anes_exog = sm.add_constant(anes_exog, prepend=False)
mlogit_mod = sm.MNLogit(anes_data.endog, anes_exog)
alpha = 10 * np.ones((mlogit_mod.J - 1, mlogit_mod.K))
alpha[-1,:] = 0
mlogit_l1_res = mlogit_mod.fit_regularized(
method='l1', alpha=alpha, trim_mode='auto', auto_trim_tol=0.02,
acc=1e-10)
"""
self.params = [[ 0.00100163, -0.05864195, -0.06147822, -0.04769671, -0.05222987,
-0.09522432],
[ 0. , 0.03186139, 0.12048999, 0.83211915, 0.92330292,
1.5680646 ],
[-0.0218185 , -0.01988066, -0.00808564, -0.00487463, -0.01400173,
-0.00562079],
[ 0. , 0.03306875, 0. , 0.02362861, 0.05486435,
0.14656966],
[ 0. , 0.04448213, 0.03252651, 0.07661761, 0.07265266,
0.0967758 ],
[ 0.90993803, -0.50081247, -2.08285102, -5.26132955, -4.86783179,
-9.31537963]]
self.conf_int = [[[ -0.0646223 , 0.06662556],
[ np.nan, np.nan],
[ -0.03405931, -0.00957768],
[ np.nan, np.nan],
[ np.nan, np.nan],
[ 0.26697895, 1.55289711]],
[[ -0.1337913 , 0.01650741],
[ -0.14477255, 0.20849532],
[ -0.03500303, -0.00475829],
[ -0.11406121, 0.18019871],
[ 0.00479741, 0.08416684],
[ -1.84626136, 0.84463642]],
[[ -0.17237962, 0.04942317],
[ -0.15146029, 0.39244026],
[ -0.02947379, 0.01330252],
[ np.nan, np.nan],
[ -0.02501483, 0.09006785],
[ -3.90379391, -0.26190812]],
[[ -0.12938296, 0.03398954],
[ 0.62612955, 1.03810876],
[ -0.02046322, 0.01071395],
[ -0.13738534, 0.18464256],
[ 0.03017236, 0.12306286],
[ -6.91227465, -3.61038444]],
[[ -0.12469773, 0.02023799],
[ 0.742564 , 1.10404183],
[ -0.02791975, -0.00008371],
[ -0.08491561, 0.19464431],
[ 0.0332926 , 0.11201273],
[ -6.29331126, -3.44235233]],
[[ -0.17165567, -0.01879296],
[ 1.33994079, 1.79618841],
[ -0.02027503, 0.00903345],
[ -0.00267819, 0.29581751],
[ 0.05343135, 0.14012026],
[-11.10419107, -7.52656819]]]
self.bse = [[ 0.03348221, 0.03834221, 0.05658338, 0.04167742, 0.03697408,
0.03899631],
[ np.nan, 0.09012101, 0.13875269, 0.10509867, 0.09221543,
0.11639184],
[ 0.00624543, 0.00771564, 0.01091253, 0.00795351, 0.00710116,
0.00747679],
[ np.nan, 0.07506769, np.nan, 0.08215148, 0.07131762,
0.07614826],
[ np.nan, 0.02024768, 0.02935837, 0.02369699, 0.02008204,
0.02211492],
[ 0.32804638, 0.68646613, 0.92906957, 0.84233441, 0.72729881,
0.91267567]]
self.nnz_params = 32
self.aic = 3019.4391360294126
self.bic = 3174.6431733460686
0
Example 91
Project: finmarketpy Source File: backtestengine.py
def calculate_leverage_factor(self, returns_df, vol_target, vol_max_leverage, vol_periods=60, vol_obs_in_year=252,
vol_rebalance_freq='BM', data_resample_freq=None, data_resample_type='mean',
returns=True, period_shift=0):
"""
calculate_leverage_factor - Calculates the time series of leverage for a specified vol target
Parameters
----------
returns_df : DataFrame
Asset returns
vol_target : float
vol target for assets
vol_max_leverage : float
maximum leverage allowed
vol_periods : int
number of periods to calculate volatility
vol_obs_in_year : int
number of observations in the year
vol_rebalance_freq : str
how often to rebalance
vol_resample_type : str
do we need to resample the underlying data first? (eg. have we got intraday data?)
returns : boolean
is this returns time series or prices?
period_shift : int
should we delay the signal by a number of periods?
Returns
-------
pandas.Dataframe
"""
calculations = Calculations()
filter = Filter()
if data_resample_freq is not None:
return
# TODO not implemented yet
if not returns: returns_df = calculations.calculate_returns(returns_df)
roll_vol_df = calculations.rolling_volatility(returns_df,
periods=vol_periods, obs_in_year=vol_obs_in_year).shift(
period_shift)
# calculate the leverage as function of vol target (with max lev constraint)
lev_df = vol_target / roll_vol_df
lev_df[lev_df > vol_max_leverage] = vol_max_leverage
lev_df = filter.resample_time_series_frequency(lev_df, vol_rebalance_freq, data_resample_type)
returns_df, lev_df = returns_df.align(lev_df, join='left', axis=0)
lev_df = lev_df.fillna(method='ffill')
lev_df.ix[0:vol_periods] = numpy.nan # ignore the first elements before the vol window kicks in
return lev_df
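The heart of the routine is the ratio of the vol target to realised rolling volatility, capped at vol_max_leverage and blanked out with numpy.nan until the vol window has enough data. A minimal pandas sketch of that idea, independent of the finmarketpy Calculations/Filter helpers and with hypothetical parameter values, might look like:

import numpy as np
import pandas as pd

returns_df = pd.DataFrame({'asset': np.random.normal(0.0, 0.01, 500)})
vol_target, vol_max_leverage, vol_periods, obs_in_year = 0.1, 3.0, 60, 252

# realised annualised rolling volatility
roll_vol_df = returns_df.rolling(vol_periods).std() * np.sqrt(obs_in_year)

# leverage as a function of the vol target, with a maximum-leverage cap
lev_df = (vol_target / roll_vol_df).clip(upper=vol_max_leverage)

# ignore the first elements before the vol window kicks in
lev_df.iloc[0:vol_periods] = np.nan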
0
Example 92
Project: crab Source File: test_pairwise.py
def test_adjusted_cosine():
""" Check that the pairwise Pearson distances computation"""
#Idepontent Test
X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
EFV = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
D = adjusted_cosine(X, X, EFV)
assert_array_almost_equal(D, [[1.]])
#Vector x Non Vector
X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
Y = [[]]
EFV = [[]]
assert_raises(ValueError, adjusted_cosine, X, Y, EFV)
#Vector A x Vector B
X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
Y = [[3.0, 3.5, 1.5, 5.0, 3.5, 3.0]]
EFV = [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0]]
D = adjusted_cosine(X, Y, EFV)
assert_array_almost_equal(D, [[0.80952381]])
#Vector N x 1
X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0], [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
Y = [[3.0, 3.5, 1.5, 5.0, 3.5, 3.0]]
EFV = [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0]]
D = adjusted_cosine(X, Y, EFV)
assert_array_almost_equal(D, [[0.80952381], [0.80952381]])
#N-Dimensional Vectors
X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0], [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
Y = [[3.0, 3.5, 1.5, 5.0, 3.5, 3.0], [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
EFV = [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0]]
D = adjusted_cosine(X, Y, EFV)
assert_array_almost_equal(D, [[0.80952381, 1.], [0.80952381, 1.]])
X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0], [3.0, 3.5, 1.5, 5.0, 3.5, 3.0]]
EFV = [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0]]
D = adjusted_cosine(X, X, EFV)
assert_array_almost_equal(D, [[1., 0.80952381], [0.80952381, 1.]])
X = [[1.0, 0.0], [1.0, 1.0]]
Y = [[0.0, 0.0]]
EFV = [[0.0, 0.0]]
D = adjusted_cosine(X, Y, EFV)
assert_array_almost_equal(D, [[np.nan], [np.nan]])
#Test Sparse Matrices
X = csr_matrix(X)
Y = csr_matrix(Y)
EFV = csr_matrix(EFV)
assert_raises(ValueError, adjusted_cosine, X, Y, EFV)
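For reference, adjusted cosine similarity subtracts a per-item baseline (EFV above) from both vectors before taking the cosine, and a zero norm after the adjustment yields np.nan, which is what the [[np.nan], [np.nan]] assertion above exercises. A minimal NumPy sketch of the metric (not the crab implementation) is:

import numpy as np

def adjusted_cosine_sketch(x, y, efv):
    # cosine similarity after removing the per-item offsets in efv
    x = np.asarray(x, dtype=float) - efv
    y = np.asarray(y, dtype=float) - efv
    denom = np.linalg.norm(x) * np.linalg.norm(y)
    # a zero-length adjusted vector gives 0/0, i.e. np.nan
    return np.dot(x, y) / denom if denom else np.nan

print(adjusted_cosine_sketch([2.5, 3.5, 3.0, 3.5, 2.5, 3.0],
                             [3.0, 3.5, 1.5, 5.0, 3.5, 3.0],
                             [2.0, 2.0, 2.0, 2.0, 2.0, 2.0]))  # ~0.80952381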
0
Example 93
def channel_capacity(n,m,sum_x=1):
'''
Boyd and Vandenberghe, Convex Optimization, exercise 4.57 page 207
Capacity of a communication channel.
We consider a communication channel, with input x(t)∈{1,..,n} and
output Y(t)∈{1,...,m}, for t=1,2,... .The relation between the
input and output is given statistically:
p_(i,j) = ℙ(Y(t)=i|X(t)=j), i=1,...,m, j=1,...,n
The matrix P ∈ ℝ^(m*n) is called the channel transition matrix, and
the channel is called a discrete memoryless channel. Assuming X has a
probability distribution denoted x ∈ ℝ^n, i.e.,
x_j = ℙ(X=j), j=1,...,n
The mutual information between X and Y is given by
∑(∑(x_j p_(i,j)log_2(p_(i,j)/∑(x_k p_(i,k)))))
Then channel capacity C is given by
C = sup I(X;Y).
With a variable change of y = Px this becomes
I(X;Y)= c^T x - ∑(y_i log_2 y_i)
where c_j = ∑(p_(i,j)log_2(p_(i,j)))
'''
# n is the number of different input values
# m is the number of different output values
if n*m == 0:
print('The range of both input and output values must be greater than zero')
return 'failed',np.nan,np.nan
# P is the channel transition matrix
P = np.ones((m,n))
# x is probability distribution of the input signal X(t)
x = cvx.Variable(rows=n,cols=1)
# y is the probability distribution of the output signal Y(t)
y = P*x
# I is the mutual information between x and y
c = np.sum(P*np.log2(P),axis=0)
I = c*x + cvx.sum_entries(cvx.entr(y))
# Channel capacity maximised by maximising the mutual information
obj = cvx.Minimize(-I)
constraints = [cvx.sum_entries(x) == sum_x,x >= 0]
# Form and solve problem
prob = cvx.Problem(obj,constraints)
prob.solve()
if prob.status=='optimal':
return prob.status,prob.value,x.value
else:
return prob.status,np.nan,np.nan
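Assuming cvxpy is installed with the pre-1.0 API used above (cvx.Variable(rows=..., cols=...), cvx.sum_entries, cvx.entr), a call looks like the sketch below. Two caveats worth flagging: prob.value is the minimised -I, so the capacity is its negation, and cvx.entr uses the natural logarithm while c is built with np.log2, so the objective mixes nats and bits unless one term is rescaled.

import numpy as np

status, value, x_opt = channel_capacity(2, 2)
if status == 'optimal':
    print(-value, np.ravel(x_opt))  # negate because the objective minimised was -I
else:
    # infeasible or unbounded problems fall back to (status, np.nan, np.nan)
    print(status)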
0
Example 94
Project: statsmodels Source File: factormodels.py
def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
'''estimate the model and selection criteria for up to maxfact factors
The selection criteria that are calculated are AIC, BIC, and R2_adj, and
additionally the cross-validation prediction error sum of squares if `skip_crossval`
is false. Cross-validation is not used by default because it can be
time consuming to calculate.
By default the cross-validation method is Leave-one-out on the full dataset.
A different cross-validation sample can be specified as an argument to
cv_iter.
Results are attached in `results_find_nfact`
'''
#print 'OLS on Factors'
if not hasattr(self, 'factors'):
self.calc_factors()
hasconst = self.hasconst
if maxfact is None:
maxfact = self.factors.shape[1] - hasconst
if (maxfact+hasconst) < 1:
raise ValueError('nothing to do, number of factors (incl. constant) should ' +
'be at least 1')
#temporary safety
maxfact = min(maxfact, 10)
y0 = self.endog
results = []
#xred, fact, eva, eve = pca(x0, keepdim=0, normalize=1)
for k in range(1, maxfact+hasconst): #k now includes the constant
#xred, fact, eva, eve = pca(x0, keepdim=k, normalize=1)
# this is faster and same result
fact = self.factors[:,:k]
res = sm.OLS(y0, fact).fit()
## print 'k =', k
## print res.params
## print 'aic: ', res.aic
## print 'bic: ', res.bic
## print 'llf: ', res.llf
## print 'R2 ', res.rsquared
## print 'R2 adj', res.rsquared_adj
if not skip_crossval:
if cv_iter is None:
cv_iter = LeaveOneOut(len(y0))
prederr2 = 0.
for inidx, outidx in cv_iter:
res_l1o = sm.OLS(y0[inidx], fact[inidx,:]).fit()
#print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
prederr2 += (y0[outidx] -
res_l1o.model.predict(res_l1o.params, fact[outidx,:]))**2.
else:
prederr2 = np.nan
results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])
self.results_find_nfact = results = np.array(results)
self.best_nfact = np.r_[(np.argmin(results[:,1:3],0), np.argmax(results[:,3],0),
np.argmin(results[:,-1],0))]
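A minimal sketch of the same selection loop with plain statsmodels OLS on synthetic factors (the data and variable names below are hypothetical, and cross-validation is skipped, so the last column is just np.nan as in the snippet above):

import numpy as np
import statsmodels.api as sm

rng = np.random.RandomState(0)
factors = sm.add_constant(rng.normal(size=(100, 3)))  # constant + 3 candidate factors
y = factors[:, :3] @ np.array([1.0, 2.0, -1.0]) + rng.normal(scale=0.1, size=100)

results = []
for k in range(1, factors.shape[1] + 1):
    res = sm.OLS(y, factors[:, :k]).fit()
    results.append([k, res.aic, res.bic, res.rsquared_adj, np.nan])

results = np.array(results)
best_k_by_aic = int(results[np.argmin(results[:, 1]), 0])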
0
Example 95
Project: cmonkey2 Source File: BSCM.py
def getVarianceMeanSDvect(ratioVect, n, tolerance = 0.01, maxTime=600, chunkSize=200, verbose=False,
expName=None):
"""Given a ratios matrix and a number of genes, figure out the expected distribution of variances
Will sample background until the mean and sd converge or the operation times out
Will return a list of variances to be used for statistical tests,
or return nan if only nan values in ratioVect
Keyword arguments:
ratioVect -- A a vector of ratios
n -- The number of genes to sample
tolerance -- The fraction tolance to use as a stopping condition (DEFAULT: 0.01)
maxTime -- The approximate maximum time to run in seconds (DEFAULT: 600)
chunkSize -- The number of samples to add between test (DEFAULT: 200)
verbose -- Set to false to suppress output (DEFAULT: False)
expName -- Set to echo this name if verbose = True (DEFAULT: None)
Useage:
varDist = getVarianceMeanSD(ratioVect, n)
"""
ratioVect = [x for x in ratioVect if not math.isnan(x)]
if verbose:
logging.info("Calculating background for %d sampled from %d in %s", n, len(ratioVect), expName)
if n <= 1 or n > len(ratioVect):
return [np.nan]
varList = []
repeat = True
startTime = dt.datetime.now()
while repeat:
newVars = []
for i in range(0, chunkSize):
curSample = random.sample(ratioVect, n)
try:
newVar = np.var(curSample)
except:
newVar = 0
newVars.append(newVar)
if len(varList) > 0: #True if past the first sample
oldMean = np.mean(varList)
oldVar = np.var(varList)
varList = varList+newVars
newMean = np.mean(varList)
newVar = np.var(varList)
meanWinTol = abs(newMean-oldMean) < tolerance*abs(oldMean)
varWinTol = abs(oldVar-newVar) < tolerance*abs(oldVar)
if meanWinTol and varWinTol:
repeat = False
else:
varList = varList+newVars
curTime = dt.datetime.now()
if (curTime-startTime).seconds > maxTime:
repeat = False
return varList
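The stopping rule above keeps drawing chunkSize random samples of size n and halts once both the running mean and variance of the sampled variances move by less than the tolerance. A compact NumPy-only sketch of that loop (a hypothetical helper, without the maxTime cut-off of the original):

import numpy as np

def sample_variances_sketch(ratio_vect, n, tolerance=0.01, chunk_size=200):
    ratio_vect = [x for x in ratio_vect if not np.isnan(x)]
    if n <= 1 or n > len(ratio_vect):
        return [np.nan]
    rng = np.random.default_rng(0)
    var_list = []
    while True:
        new_vars = [np.var(rng.choice(ratio_vect, size=n, replace=False))
                    for _ in range(chunk_size)]
        if var_list:
            old_mean, old_var = np.mean(var_list), np.var(var_list)
            var_list += new_vars
            mean_ok = abs(np.mean(var_list) - old_mean) < tolerance * abs(old_mean)
            var_ok = abs(np.var(var_list) - old_var) < tolerance * abs(old_var)
            if mean_ok and var_ok:
                return var_list
        else:
            var_list += new_vars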
0
Example 96
Project: pyspeckit Source File: cube_fit.py
def cube_fit(cubefilename, outfilename, errfilename=None, scale_keyword=None,
vheight=False, verbose=False, signal_cut=3, verbose_level=2,
clobber=True, **kwargs):
"""
Light-weight wrapper for cube fitting
Takes a cube and error map (error will be computed naively if not given)
and computes moments then fits for each spectrum in the cube. It then
saves the fitted parameters to a reasonably descriptive output file whose
header will look like ::
PLANE1 = 'amplitude'
PLANE2 = 'velocity'
PLANE3 = 'sigma'
PLANE4 = 'err_amplitude'
PLANE5 = 'err_velocity'
PLANE6 = 'err_sigma'
PLANE7 = 'integral'
PLANE8 = 'integral_error'
CDELT3 = 1
CTYPE3 = 'FITPAR'
CRVAL3 = 0
CRPIX3 = 1
Parameters
----------
errfilename: [ None | string name of .fits file ]
A two-dimensional error map to use for computing signal-to-noise cuts
scale_keyword: [ None | Char ]
Keyword to pass to the data cube loader - multiplies cube by the number
indexed by this header kwarg if it exists. e.g., if your cube is in
T_A units and you want T_A*
vheight: [ bool ]
Is there a background to be fit? Used in moment computation
verbose: [ bool ]
verbose_level: [ int ]
How loud will the fitting procedure be? Passed to momenteach and fiteach
signal_cut: [ float ]
Signal-to-Noise ratio minimum. Spectra with a peak below this S/N ratio
will not be fit and will be left blank in the output fit parameter cube
clobber: [ bool ]
Overwrite parameter .fits cube if it exists?
`kwargs` are passed to :class:`pyspeckit.Spectrum.specfit`
"""
# Load the spectrum
sp = pyspeckit.Cube(cubefilename,scale_keyword=scale_keyword)
if os.path.exists(errfilename):
errmap = pyfits.getdata(errfilename)
else:
# very simple error calculation... biased and bad, needs improvement
# try using something from the cubes package
errmap = sp.cube.std(axis=0)
# Run the fitter
sp.mapplot()
# Compute the moments at each position to come up with reasonable guesses.
# This speeds up the process enormously, but can easily mess up the fits if
# there are bad pixels
sp.momenteach(vheight=vheight, verbose=verbose)
sp.fiteach(errmap=errmap, multifit=None, verbose_level=verbose_level,
signal_cut=signal_cut, usemomentcube=True, blank_value=np.nan,
verbose=verbose, **kwargs)
# steal the header from the error map
f = pyfits.open(cubefilename)
# start replacing components of the pyfits object
f[0].data = np.concatenate([sp.parcube,sp.errcube,sp.integralmap])
f[0].header['PLANE1'] = 'amplitude'
f[0].header['PLANE2'] = 'velocity'
f[0].header['PLANE3'] = 'sigma'
f[0].header['PLANE4'] = 'err_amplitude'
f[0].header['PLANE5'] = 'err_velocity'
f[0].header['PLANE6'] = 'err_sigma'
f[0].header['PLANE7'] = 'integral'
f[0].header['PLANE8'] = 'integral_error'
f[0].header['CDELT3'] = 1
f[0].header['CTYPE3'] = 'FITPAR'
f[0].header['CRVAL3'] = 0
f[0].header['CRPIX3'] = 1
# save your work
f.writeto(outfilename, clobber=clobber)
return sp
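Assuming a FITS spectral cube and a matching two-dimensional error map are on disk, calling the wrapper is straightforward (the file names below are hypothetical). Note that leaving errfilename as None appears to make os.path.exists(None) raise a TypeError, so an error-map path is expected in practice.

sp = cube_fit('mycube.fits', 'mycube_fitpars.fits',
              errfilename='mycube_err.fits',  # 2D error map used for the S/N cut
              signal_cut=3,                   # spectra with peak S/N below 3 are left blank (np.nan)
              clobber=True)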
0
Example 97
Project: empyrical Source File: stats.py
def omega_ratio(returns, risk_free=0.0, required_return=0.0,
annualization=APPROX_BDAYS_PER_YEAR):
"""Determines the Omega ratio of a strategy.
Parameters
----------
returns : pd.Series or np.ndarray
Daily returns of the strategy, noncumulative.
- See full explanation in :func:`~empyrical.stats.cum_returns`.
risk_free : int, float
Constant risk-free return throughout the period
required_return : float, optional
Minimum acceptable return of the investor. Threshold over which to
consider positive vs negative returns. It will be converted to a
value appropriate for the period of the returns. E.g. An annual minimum
acceptable return of 100 will translate to a minimum acceptable
return of 0.018.
annualization : int, optional
Factor used to convert the required_return into a daily
value. Enter 1 if no time period conversion is necessary.
Returns
-------
float
Omega ratio.
Note
-----
See https://en.wikipedia.org/wiki/Omega_ratio for more details.
"""
if len(returns) < 2:
return np.nan
if annualization == 1:
return_threshold = required_return
elif required_return <= -1:
return np.nan
else:
return_threshold = (1 + required_return) ** \
(1. / annualization) - 1
returns_less_thresh = returns - risk_free - return_threshold
numer = sum(returns_less_thresh[returns_less_thresh > 0.0])
denom = -1.0 * sum(returns_less_thresh[returns_less_thresh < 0.0])
if denom > 0.0:
return numer / denom
else:
return np.nan
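A small worked example, assuming empyrical is installed so the function above can be imported directly: with no annualization the threshold is just required_return, gains above it sum to 0.04 and losses sum to 0.02, so the ratio is 2.0.

import pandas as pd
from empyrical import omega_ratio  # same function as the snippet above

returns = pd.Series([0.01, -0.02, 0.03])
print(omega_ratio(returns, risk_free=0.0, required_return=0.0, annualization=1))  # 2.0

# fewer than two observations (or no losing periods) yields np.nan
print(omega_ratio(pd.Series([0.01])))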
0
Example 98
def load_raw_arrays(self, columns, start_date, end_date, assets):
"""
Parameters
----------
columns : list of str
'open', 'high', 'low', 'close', or 'volume'
start_date : Timestamp
Beginning of the window range.
end_date : Timestamp
End of the window range.
assets : list of int
The asset identifiers in the window.
Returns
-------
list of np.ndarray
A list with an entry per field of ndarrays with shape
(minutes in range, sids) with a dtype of float64, containing the
values for the respective field over start and end dt range.
"""
rolls_by_asset = {}
tc = self.trading_calendar
start_session = tc.minute_to_session_label(start_date)
end_session = tc.minute_to_session_label(end_date)
for asset in assets:
rf = self._roll_finders[asset.roll_style]
rolls_by_asset[asset] = rf.get_rolls(
asset.root_symbol,
start_session,
end_session, asset.offset)
sessions = tc.sessions_in_range(start_date, end_date)
minutes = tc.minutes_in_range(start_date, end_date)
num_minutes = len(minutes)
shape = num_minutes, len(assets)
results = []
# Get partitions
partitions_by_asset = {}
for asset in assets:
partitions = []
partitions_by_asset[asset] = partitions
rolls = rolls_by_asset[asset]
start = start_date
for roll in rolls:
sid, roll_date = roll
start_loc = minutes.searchsorted(start)
if roll_date is not None:
_, end = tc.open_and_close_for_session(
roll_date - sessions.freq)
end_loc = minutes.searchsorted(end)
else:
end = end_date
end_loc = len(minutes) - 1
partitions.append((sid, start, end, start_loc, end_loc))
if roll[-1] is not None:
start, _ = tc.open_and_close_for_session(
tc.minute_to_session_label(minutes[end_loc + 1]))
for column in columns:
if column != 'volume':
out = np.full(shape, np.nan)
else:
out = np.zeros(shape, dtype=np.uint32)
for i, asset in enumerate(assets):
partitions = partitions_by_asset[asset]
for sid, start, end, start_loc, end_loc in partitions:
if column != 'sid':
result = self._bar_reader.load_raw_arrays(
[column], start, end, [sid])[0][:, 0]
else:
result = int(sid)
out[start_loc:end_loc + 1, i] = result
results.append(out)
return results
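One detail worth noting in the loop above is the per-column fill value: price-like fields use a float buffer pre-filled with np.nan so untouched minutes stay missing, while volume uses an unsigned-integer buffer, where NaN cannot be represented, so missing minutes are simply zero. A short illustration:

import numpy as np

shape = (4, 2)  # (minutes in range, number of sids)
price_out = np.full(shape, np.nan)             # float64 buffer; untouched cells remain NaN
volume_out = np.zeros(shape, dtype=np.uint32)  # integer dtype has no NaN, so missing data is 0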
0
Example 99
def spsolve(A, b, permc_spec=None, use_umfpack=True):
"""Solve the sparse linear system Ax=b, where b may be a vector or a matrix.
Parameters
----------
A : ndarray or sparse matrix
The square matrix A will be converted into CSC or CSR form
b : ndarray or sparse matrix
The matrix or vector representing the right hand side of the equation.
If a vector, b.shape must be (n,) or (n, 1).
permc_spec : str, optional
How to permute the columns of the matrix for sparsity preservation.
(default: 'COLAMD')
- ``NATURAL``: natural ordering.
- ``MMD_ATA``: minimum degree ordering on the structure of A^T A.
- ``MMD_AT_PLUS_A``: minimum degree ordering on the structure of A^T+A.
- ``COLAMD``: approximate minimum degree column ordering
use_umfpack : bool, optional
if True (default) then use umfpack for the solution. This is
only referenced if b is a vector and ``scikit-umfpack`` is installed.
Returns
-------
x : ndarray or sparse matrix
the solution of the sparse linear equation.
If b is a vector, then x is a vector of size A.shape[1]
If b is a matrix, then x is a matrix of size (A.shape[1], b.shape[1])
Notes
-----
For solving the matrix expression AX = B, this solver assumes the resulting
matrix X is sparse, as is often the case for very sparse inputs. If the
resulting X is dense, the construction of this sparse result will be
relatively expensive. In that case, consider converting A to a dense
matrix and using scipy.linalg.solve or its variants.
"""
if not (isspmatrix_csc(A) or isspmatrix_csr(A)):
A = csc_matrix(A)
warn('spsolve requires A be CSC or CSR matrix format',
SparseEfficiencyWarning)
# b is a vector only if b has shape (n,) or (n, 1)
b_is_sparse = isspmatrix(b)
if not b_is_sparse:
b = asarray(b)
b_is_vector = ((b.ndim == 1) or (b.ndim == 2 and b.shape[1] == 1))
A.sort_indices()
A = A.asfptype() # upcast to a floating point format
# validate input shapes
M, N = A.shape
if (M != N):
raise ValueError("matrix must be square (has shape %s)" % ((M, N),))
if M != b.shape[0]:
raise ValueError("matrix - rhs dimension mismatch (%s - %s)"
% (A.shape, b.shape[0]))
use_umfpack = use_umfpack and useUmfpack
if b_is_vector and use_umfpack:
if b_is_sparse:
b_vec = b.toarray()
else:
b_vec = b
b_vec = asarray(b_vec, dtype=A.dtype).ravel()
if noScikit:
raise RuntimeError('Scikits.umfpack not installed.')
if A.dtype.char not in 'dD':
raise ValueError("convert matrix data to double, please, using"
" .astype(), or set linsolve.useUmfpack = False")
umf = umfpack.UmfpackContext(_get_umf_family(A))
x = umf.linsolve(umfpack.UMFPACK_A, A, b_vec,
autoTranspose=True)
else:
if b_is_vector and b_is_sparse:
b = b.toarray()
b_is_sparse = False
if not b_is_sparse:
if isspmatrix_csc(A):
flag = 1 # CSC format
else:
flag = 0 # CSR format
options = dict(ColPerm=permc_spec)
x, info = _superlu.gssv(N, A.nnz, A.data, A.indices, A.indptr,
b, flag, options=options)
if info != 0:
warn("Matrix is exactly singular", MatrixRankWarning)
x.fill(np.nan)
if b_is_vector:
x = x.ravel()
else:
# b is sparse
Afactsolve = factorized(A)
if not isspmatrix_csc(b):
warn('spsolve is more efficient when sparse b '
'is in the CSC matrix format', SparseEfficiencyWarning)
b = csc_matrix(b)
# Create a sparse output matrix by repeatedly applying
# the sparse factorization to solve columns of b.
data_segs = []
row_segs = []
col_segs = []
for j in range(b.shape[1]):
bj = b[:, j].A.ravel()
xj = Afactsolve(bj)
w = np.flatnonzero(xj)
segment_length = w.shape[0]
row_segs.append(w)
col_segs.append(np.ones(segment_length, dtype=int)*j)
data_segs.append(np.asarray(xj[w], dtype=A.dtype))
sparse_data = np.concatenate(data_segs)
sparse_row = np.concatenate(row_segs)
sparse_col = np.concatenate(col_segs)
x = A.__class__((sparse_data, (sparse_row, sparse_col)),
shape=b.shape, dtype=A.dtype)
return x
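A short usage example against SciPy's public API: a well-posed 2x2 system solves normally, while an exactly singular matrix takes the MatrixRankWarning branch above and, on the default SuperLU path (no scikit-umfpack), returns a result filled with np.nan.

import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import spsolve

A = csc_matrix([[3.0, 0.0],
                [1.0, 2.0]])
b = np.array([9.0, 8.0])
print(spsolve(A, b))  # [3.  2.5]

A_singular = csc_matrix([[1.0, 1.0],
                         [1.0, 1.0]])
print(spsolve(A_singular, b))  # warns "Matrix is exactly singular" and prints [nan nan]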
0
Example 100
Project: orange3-text Source File: pubmed.py
def _date_to_iso(date):
possible_date_formats = [
'%Y %b %d',
'%Y %b',
'%Y',
]
season_mapping = {
'fall': 'Sep',
'autumn': 'Sep',
'winter': 'Dec',
'spring': 'Mar',
'summer': 'Jun',
}
date = date.lower()
# Seasons to their respective months.
for season, month in season_mapping.items():
date = date.replace(season, month)
date = date.split('-')[0] # 2015 Sep-Dec --> 2015 Sep
time_var = TimeVariable()
for date_format in possible_date_formats:
try:
date_string = datetime.strptime(
date, date_format
).date().isoformat()
return time_var.parse(date_string)
except ValueError:
continue # Try the next format.
warnings.warn(
'Could not parse "{}" into a date.'.format(date),
RuntimeWarning
)
return time_var.parse(np.nan)
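The normalisation steps above (lower-casing, season-to-month substitution, dropping the range after '-', then trying progressively coarser formats) can be sketched without Orange's TimeVariable; the helper below is hypothetical and simply returns an ISO date string, or None for unparseable input:

from datetime import datetime

SEASONS = {'fall': 'Sep', 'autumn': 'Sep', 'winter': 'Dec', 'spring': 'Mar', 'summer': 'Jun'}
FORMATS = ['%Y %b %d', '%Y %b', '%Y']

def date_to_iso_sketch(date):
    date = date.lower()
    for season, month in SEASONS.items():
        date = date.replace(season, month)
    date = date.split('-')[0]  # '2015 sep-dec' -> '2015 sep'
    for fmt in FORMATS:
        try:
            return datetime.strptime(date, fmt).date().isoformat()
        except ValueError:
            continue
    return None  # unparseable dates are left missing

print(date_to_iso_sketch('2015 Fall'))     # 2015-09-01
print(date_to_iso_sketch('2014 Sep-Dec'))  # 2014-09-01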