numpy.nan

Here are examples of the Python API numpy.nan taken from open source projects.

171 Examples

Example 51

Project: AlephNull Source File: test_events_through_risk.py
    def test_daily_buy_and_hold(self):

        start_date = datetime.datetime(
            year=2006,
            month=1,
            day=3,
            hour=0,
            minute=0,
            tzinfo=pytz.utc)
        end_date = datetime.datetime(
            year=2006,
            month=1,
            day=5,
            hour=0,
            minute=0,
            tzinfo=pytz.utc)

        sim_params = SimulationParameters(
            period_start=start_date,
            period_end=end_date,
            emission_rate='daily'
        )

        algo = BuyAndHoldAlgorithm(
            sim_params=sim_params,
            data_frequency='daily')

        first_date = datetime.datetime(2006, 1, 3, tzinfo=pytz.utc)
        second_date = datetime.datetime(2006, 1, 4, tzinfo=pytz.utc)
        third_date = datetime.datetime(2006, 1, 5, tzinfo=pytz.utc)

        trade_bar_data = [
            Event({
                'open_price': 10,
                'close_price': 15,
                'price': 15,
                'volume': 1000,
                'sid': 1,
                'dt': first_date,
                'source_id': 'test-trade-source',
                'type': DATASOURCE_TYPE.TRADE
            }),
            Event({
                'open_price': 15,
                'close_price': 20,
                'price': 20,
                'volume': 2000,
                'sid': 1,
                'dt': second_date,
                'source_id': 'test_list',
                'type': DATASOURCE_TYPE.TRADE
            }),
            Event({
                'open_price': 20,
                'close_price': 15,
                'price': 15,
                'volume': 1000,
                'sid': 1,
                'dt': third_date,
                'source_id': 'test_list',
                'type': DATASOURCE_TYPE.TRADE
            }),
        ]
        benchmark_data = [
            Event({
                'returns': 0.1,
                'dt': first_date,
                'source_id': 'test-benchmark-source',
                'type': DATASOURCE_TYPE.BENCHMARK
            }),
            Event({
                'returns': 0.2,
                'dt': second_date,
                'source_id': 'test-benchmark-source',
                'type': DATASOURCE_TYPE.BENCHMARK
            }),
            Event({
                'returns': 0.4,
                'dt': third_date,
                'source_id': 'test-benchmark-source',
                'type': DATASOURCE_TYPE.BENCHMARK
            }),
        ]

        algo.benchmark_return_source = benchmark_data
        algo.sources = list([trade_bar_data])
        gen = algo._create_generator(sim_params)

        # TODO: Hand derive these results.
        #       Currently, these values are just the output from the time of
        #       this writing, recorded to at least be an early warning against
        #       changes.
        expected_algorithm_returns = {
            first_date: 0.0,
            second_date: -0.000350,
            third_date: -0.050018
        }

        # TODO: Hand derive these results.
        #       Currently, these values are just the output from the time of
        #       this writing, recorded to at least be an early warning against
        #       changes.
        expected_sharpe = {
            first_date: np.nan,
            second_date: -31.56903265,
            third_date: -11.459888981,
        }

        for bar in gen:
            current_dt = algo.datetime
            crm = algo.perf_tracker.cumulative_risk_metrics

            np.testing.assert_almost_equal(
                crm.algorithm_returns[current_dt],
                expected_algorithm_returns[current_dt],
                decimal=6)

            np.testing.assert_almost_equal(
                crm.metrics.sharpe[current_dt],
                expected_sharpe[current_dt],
                decimal=6,
                err_msg="Mismatch at %s" % (current_dt,))
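
The first-day Sharpe ratio above is expected to be np.nan because a single observation has no defined volatility. A minimal standalone sketch (not AlephNull code) of why the assertion still passes: NaN never compares equal to itself with ==, but numpy's testing helpers treat a NaN in both the actual and the expected value as a match.

import numpy as np

print(np.nan == np.nan)                             # False
np.testing.assert_almost_equal(np.nan, np.nan)      # passes: NaN matches NaN
np.testing.assert_almost_equal(
    [np.nan, -31.569033], [np.nan, -31.569033], decimal=6)   # also passes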

Example 52

Project: finmarketpy Source File: eventstudy.py
    def get_intraday_moves_over_custom_event(self, data_frame_rets, ef_time_frame, vol=False,
                                             minute_start = 5, mins = 3 * 60, min_offset = 0 , create_index = False,
                                             resample = False, freq = 'minutes'):

        filter = Filter()

        ef_time_frame = filter.filter_time_series_by_date(data_frame_rets.index[0], data_frame_rets.index[-1], ef_time_frame)
        ef_time = ef_time_frame.index

        if freq == 'minutes':
            ef_time_start = ef_time - timedelta(minutes = minute_start)
            ef_time_end = ef_time + timedelta(minutes = mins)
            ann_factor = 252 * 1440
        elif freq == 'days':
            ef_time = ef_time_frame.index.normalize()
            ef_time_start = ef_time - timedelta(days = minute_start)
            ef_time_end = ef_time + timedelta(days = mins)
            ann_factor = 252

        ords = range(-minute_start + min_offset, mins + min_offset)

        # all data needs to be equally spaced
        if resample:

            # make sure time series is properly sampled at 1 min intervals
            data_frame_rets = data_frame_rets.resample('1min')
            data_frame_rets = data_frame_rets.fillna(value = 0)
            data_frame_rets = filter.remove_out_FX_out_of_hours(data_frame_rets)

        data_frame_rets['Ind'] = numpy.nan

        start_index = data_frame_rets.index.searchsorted(ef_time_start)
        finish_index = data_frame_rets.index.searchsorted(ef_time_end)

        # not all observation windows will be same length (eg. last one?)

        # fill the indices which represent minutes
        # TODO vectorise this!
        for i in range(0, len(ef_time_frame.index)):
            try:
                data_frame_rets.ix[start_index[i]:finish_index[i], 'Ind'] = ords
            except:
                data_frame_rets.ix[start_index[i]:finish_index[i], 'Ind'] = ords[0:(finish_index[i] - start_index[i])]

        # set the release dates
        data_frame_rets.ix[start_index,'Rel'] = ef_time                                         # set entry points
        data_frame_rets.ix[finish_index + 1,'Rel'] = numpy.zeros(len(start_index))              # set exit points
        data_frame_rets['Rel'] = data_frame_rets['Rel'].fillna(method = 'pad')                  # fill down signals

        data_frame_rets = data_frame_rets[pandas.notnull(data_frame_rets['Ind'])]               # get rid of other

        data_frame = data_frame_rets.pivot(index='Ind',
                                           columns='Rel', values=data_frame_rets.columns[0])

        data_frame.index.names = [None]

        if create_index:
            calculations = Calculations()
            data_frame.ix[-minute_start + min_offset,:] = numpy.nan
            data_frame = calculations.create_mult_index(data_frame)
        else:
            if vol is True:
                # annualise (if vol)
                data_frame = data_frame.rolling(center=False,window=5).std() * math.sqrt(ann_factor)
            else:
                data_frame = data_frame.cumsum()

        return data_frame
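
The 'Ind' column pattern above (create an all-NaN float column, fill only the rows inside each event window, then keep the rows where it is no longer NaN) can be reproduced in isolation. A minimal sketch with a made-up frame standing in for data_frame_rets (not finmarketpy code, and using .loc rather than the deprecated .ix):

import numpy as np
import pandas as pd

df = pd.DataFrame({'ret': [0.1, -0.2, 0.05, 0.0, 0.02]})
df['Ind'] = np.nan                 # new float column, all missing
df.loc[1:3, 'Ind'] = [0, 1, 2]     # label the rows inside one event window
df = df[pd.notnull(df['Ind'])]     # keep only the labelled rows
print(df)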

Example 53

Project: statsmodels Source File: test_pca.py
Function: test_replace_missing
    @skipif(WIN32)
    def test_replace_missing(self):
        x = self.x.copy()
        x[::5, ::7] = np.nan

        pc = PCA(x, missing='drop-row')
        x_dropped_row = x[np.logical_not(np.any(np.isnan(x), 1))]
        pc_dropped = PCA(x_dropped_row)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, missing='drop-col')
        x_dropped_col = x[:, np.logical_not(np.any(np.isnan(x), 0))]
        pc_dropped = PCA(x_dropped_col)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, missing='drop-min')
        if x_dropped_row.size > x_dropped_col.size:
            x_dropped_min = x_dropped_row
        else:
            x_dropped_min = x_dropped_col
        pc_dropped = PCA(x_dropped_min)
        assert_equal(pc.projection, pc_dropped.projection)
        assert_equal(x, pc.data)

        pc = PCA(x, ncomp=3, missing='fill-em')
        missing = np.isnan(x)
        mu = nanmean(x, axis=0)
        errors = x - mu
        sigma = np.sqrt(nanmean(errors ** 2, axis=0))
        x_std = errors / sigma
        x_std[missing] = 0.0
        last = x_std[missing]
        delta = 1.0
        count = 0
        while delta > 5e-8:
            pc_temp = PCA(x_std, ncomp=3, standardize=False, demean=False)
            x_std[missing] = pc_temp.projection[missing]
            current = x_std[missing]
            diff = current - last
            delta = np.sqrt(np.sum(diff ** 2)) / np.sqrt(np.sum(current ** 2))
            last = current
            count += 1
        x = self.x + 0.0
        projection = pc_temp.projection * sigma + mu
        x[missing] = projection[missing]
        assert_allclose(pc._adjusted_data, x)
        # Check data for no changes
        assert_equal(self.x, self.x_copy)

        x = self.x
        pc = PCA(x)
        pc_dropped = PCA(x, missing='drop-row')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc_dropped = PCA(x, missing='drop-col')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc_dropped = PCA(x, missing='drop-min')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        pc = PCA(x, ncomp=3)
        pc_dropped = PCA(x, ncomp=3, missing='fill-em')
        assert_allclose(pc.projection, pc_dropped.projection, atol=DECIMAL_5)

        # Test too many missing for missing='fill-em'
        x = self.x.copy()
        x[:, :] = np.nan
        assert_raises(ValueError, PCA, x, missing='drop-row')
        assert_raises(ValueError, PCA, x, missing='drop-col')
        assert_raises(ValueError, PCA, x, missing='drop-min')
        assert_raises(ValueError, PCA, x, missing='fill-em')
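
The test seeds NaNs into the data and then reproduces by hand what the 'fill-em' option is expected to do, using nan-aware statistics. A condensed standalone sketch of just that NaN bookkeeping, with assumed data shapes (this is not statsmodels' PCA itself):

import numpy as np

rng = np.random.RandomState(0)
x = rng.standard_normal((20, 5))
x[::5, ::2] = np.nan                        # seed some missing values

keep_rows = ~np.any(np.isnan(x), axis=1)    # 'drop-row' style filtering
x_dropped = x[keep_rows]

mu = np.nanmean(x, axis=0)                  # column means, ignoring NaN
sigma = np.sqrt(np.nanmean((x - mu) ** 2, axis=0))
x_std = (x - mu) / sigma
x_std[np.isnan(x)] = 0.0                    # EM-style fill starts from zeros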

Example 54

Project: tia Source File: plots.py
def plot_return_on_dollar(rets, title='Return on $1', show_maxdd=0, figsize=None, ax=None, append=0, label=None, **plot_args):
    """ Show the cumulative return of specified rets and max drawdowns if selected."""
    crets = (1. + returns_cumulative(rets, expanding=1))
    if isinstance(crets, pd.DataFrame):
        tmp = crets.copy()
        for c in tmp.columns:
            s = tmp[c]
            fv = s.first_valid_index()
            fi = s.index.get_loc(fv)
            if fi != 0:
                tmp.ix[fi - 1, c] = 1.
            else:
                if not s.index.freq:
                    # no frequency set
                    freq = guess_freq(s.index)
                    s = s.asfreq(freq)
                first = s.index.shift(-1)[0]
                tmp = pd.concat([pd.DataFrame({c: [1.]}, index=[first]), tmp])
        crets = tmp
        if append:
            toadd = crets.index.shift(1)[-1]
            crets = pd.concat([crets, pd.DataFrame(np.nan, columns=crets.columns, index=[toadd])])
    else:
        fv = crets.first_valid_index()
        fi = crets.index.get_loc(fv)
        if fi != 0:
            crets = crets.copy()
            crets.iloc[fi - 1] = 1.
        else:
            if not crets.index.freq:
                first = crets.asfreq(guess_freq(crets.index)).index.shift(-1)[0]
            else:
                first = crets.index.shift(-1)[0]
            tmp = pd.Series([1.], index=[first])
            tmp = tmp.append(crets)
            crets = tmp

        if append:
            toadd = pd.Series(np.nan, index=[crets.index.shift(1)[-1]])
            crets = crets.append(toadd)

    ax = crets.plot(figsize=figsize, title=title, ax=ax, label=label, **plot_args)
    AxesFormat().Y.apply_format(new_float_formatter()).X.label("").apply(ax)
    #ax.tick_params(labelsize=14)
    if show_maxdd:
        # find the max drawdown available by using original rets
        if isinstance(rets, pd.DataFrame):
            iterator = rets.iteritems()
        else:
            iterator = iter([('', rets)])

        for c, col in iterator:
            dd, dt = max_drawdown(col, inc_date=1)
            lbl = c and c + ' maxdd' or 'maxdd'
            # get cret to place annotation correctly
            if isinstance(crets, pd.DataFrame):
                amt = crets.ix[dt, c]
            else:
                amt = crets[dt]

            bbox_props = dict(boxstyle="round", fc="w", ec="0.5", alpha=0.7)
            # sub = lambda c: c and len(c) > 2 and c[:2] or c
            try:
                dtstr = '{0}'.format(dt.to_period())
            except:
                dtstr = '{0}'.format(dt)

            ax.text(dt, amt, "mdd {0}".format(dtstr).strip(), ha="center",
                    va="center", size=10, bbox=bbox_props)
    plt.tight_layout()
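
When append is set, the function extends the cumulative-return curve by one extra period filled with NaN, so the plot gains room on the right without drawing a value there. A minimal sketch of that step with a hypothetical frame (not tia code; the index is assumed to carry a daily frequency):

import numpy as np
import pandas as pd

crets = pd.DataFrame({'a': [1.00, 1.05, 1.12]},
                     index=pd.date_range('2020-01-01', periods=3, freq='D'))
toadd = crets.index.shift(1)[-1]        # one period past the last date
crets = pd.concat([crets,
                   pd.DataFrame(np.nan, columns=crets.columns, index=[toadd])])
print(crets)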

Example 55

Project: deep_recommend_system Source File: histogram_ops_test.py
Function: synthetic_data
def synthetic_data(desired_auc, score_range, num_records, rng, frac_true):
  """Create synthetic boolean_labels and scores with adjustable auc.

  Args:
    desired_auc:  Number in [0, 1], the theoretical AUC of resultant data.
    score_range:  2-tuple, (low, high), giving the range of the resultant scores
    num_records:  Positive integer.  The number of records to return.
    rng:  Initialized np.random.RandomState random number generator
    frac_true:  Number in (0, 1).  Expected fraction of resultant labels that
      will be True.  This is just in expectation...more or less may actually be
      True.

  Returns:
    boolean_labels:  np.array, dtype=bool.
    scores:  np.array, dtype=np.float32
  """
  # We prove here why the method (below) for computing AUC works.  Of course we
  # also checked this against sklearn.metrics.roc_auc_curve.
  #
  # First do this for score_range = [0, 1], then rescale.
  # WLOG assume AUC >= 0.5, otherwise we will solve for AUC >= 0.5 then swap
  # the labels.
  # So for AUC in [0, 1] we create False and True labels
  # and corresponding scores drawn from:
  # F ~ U[0, 1],  T ~ U[x, 1]
  # We have,
  # AUC
  #  = P[T > F]
  #  = P[T > F | F < x] P[F < x] + P[T > F | F > x] P[F > x]
  #  = (1 * x) + (0.5 * (1 - x)).
  # Inverting, we have:
  # x = 2 * AUC - 1, when AUC >= 0.5.
  assert 0 <= desired_auc <= 1
  assert 0 < frac_true < 1

  if desired_auc < 0.5:
    flip_labels = True
    desired_auc = 1 - desired_auc
    frac_true = 1 - frac_true
  else:
    flip_labels = False
  x = 2 * desired_auc - 1

  labels = rng.binomial(1, frac_true, size=num_records).astype(bool)
  num_true = labels.sum()
  num_false = num_records - labels.sum()

  # Draw F ~ U[0, 1], and T ~ U[x, 1]
  false_scores = rng.rand(num_false)
  true_scores = x + rng.rand(num_true) * (1 - x)

  # Reshape [0, 1] to score_range.
  def reshape(scores):
    return score_range[0] + scores * (score_range[1] - score_range[0])

  false_scores = reshape(false_scores)
  true_scores = reshape(true_scores)

  # Place into one array corresponding with the labels.
  scores = np.nan * np.ones(num_records, dtype=np.float32)
  scores[labels] = true_scores
  scores[~labels] = false_scores

  if flip_labels:
    labels = ~labels

  return labels, scores
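
The scores array is built as a NaN-filled placeholder and then filled group by group through boolean masks; any slot the masks missed would still be NaN and easy to spot. A minimal standalone sketch of that pattern (not the TensorFlow test itself):

import numpy as np

rng = np.random.RandomState(0)
labels = rng.binomial(1, 0.3, size=10).astype(bool)

scores = np.nan * np.ones(10, dtype=np.float32)   # NaN placeholder array
scores[labels] = 1.0                              # fill the True slots
scores[~labels] = 0.0                             # fill the False slots
assert not np.isnan(scores).any()                 # nothing was left unfilled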

Example 56

Project: simpeg Source File: View.py
Function: plot_grid
    def plotGrid(self, ax=None, nodes=False, faces=False, centers=False, edges=False, lines=True, showIt=False):
        """Plot the nodal, cell-centered and staggered grids for 1,2 and 3 dimensions.


        .. plot::
            :include-source:

            from SimPEG import Mesh, Utils
            X, Y = Utils.exampleLrmGrid([3,3],'rotate')
            M = Mesh.CurvilinearMesh([X, Y])
            M.plotGrid(showIt=True)

        """
        import matplotlib.pyplot as plt
        import matplotlib
        from mpl_toolkits.mplot3d import Axes3D

        axOpts = {'projection':'3d'} if self.dim == 3 else {}
        if ax is None: ax = plt.subplot(111, **axOpts)

        NN = self.r(self.gridN, 'N', 'N', 'M')
        if self.dim == 2:

            if lines:
                X1 = np.c_[mkvc(NN[0][:-1, :]), mkvc(NN[0][1:, :]), mkvc(NN[0][:-1, :])*np.nan].flatten()
                Y1 = np.c_[mkvc(NN[1][:-1, :]), mkvc(NN[1][1:, :]), mkvc(NN[1][:-1, :])*np.nan].flatten()

                X2 = np.c_[mkvc(NN[0][:, :-1]), mkvc(NN[0][:, 1:]), mkvc(NN[0][:, :-1])*np.nan].flatten()
                Y2 = np.c_[mkvc(NN[1][:, :-1]), mkvc(NN[1][:, 1:]), mkvc(NN[1][:, :-1])*np.nan].flatten()

                X = np.r_[X1, X2]
                Y = np.r_[Y1, Y2]

                ax.plot(X, Y, 'b-')
            if centers:
                ax.plot(self.gridCC[:,0],self.gridCC[:,1],'ro')

            # Nx = self.r(self.normals, 'F', 'Fx', 'V')
            # Ny = self.r(self.normals, 'F', 'Fy', 'V')
            # Tx = self.r(self.tangents, 'E', 'Ex', 'V')
            # Ty = self.r(self.tangents, 'E', 'Ey', 'V')

            # ax.plot(self.gridN[:, 0], self.gridN[:, 1], 'bo')

            # nX = np.c_[self.gridFx[:, 0], self.gridFx[:, 0] + Nx[0]*length, self.gridFx[:, 0]*np.nan].flatten()
            # nY = np.c_[self.gridFx[:, 1], self.gridFx[:, 1] + Nx[1]*length, self.gridFx[:, 1]*np.nan].flatten()
            # ax.plot(self.gridFx[:, 0], self.gridFx[:, 1], 'rs')
            # ax.plot(nX, nY, 'r-')

            # nX = np.c_[self.gridFy[:, 0], self.gridFy[:, 0] + Ny[0]*length, self.gridFy[:, 0]*np.nan].flatten()
            # nY = np.c_[self.gridFy[:, 1], self.gridFy[:, 1] + Ny[1]*length, self.gridFy[:, 1]*np.nan].flatten()
            # #ax.plot(self.gridFy[:, 0], self.gridFy[:, 1], 'gs')
            # ax.plot(nX, nY, 'g-')

            # tX = np.c_[self.gridEx[:, 0], self.gridEx[:, 0] + Tx[0]*length, self.gridEx[:, 0]*np.nan].flatten()
            # tY = np.c_[self.gridEx[:, 1], self.gridEx[:, 1] + Tx[1]*length, self.gridEx[:, 1]*np.nan].flatten()
            # ax.plot(self.gridEx[:, 0], self.gridEx[:, 1], 'r^')
            # ax.plot(tX, tY, 'r-')

            # nX = np.c_[self.gridEy[:, 0], self.gridEy[:, 0] + Ty[0]*length, self.gridEy[:, 0]*np.nan].flatten()
            # nY = np.c_[self.gridEy[:, 1], self.gridEy[:, 1] + Ty[1]*length, self.gridEy[:, 1]*np.nan].flatten()
            # #ax.plot(self.gridEy[:, 0], self.gridEy[:, 1], 'g^')
            # ax.plot(nX, nY, 'g-')

        elif self.dim == 3:
            X1 = np.c_[mkvc(NN[0][:-1, :, :]), mkvc(NN[0][1:, :, :]), mkvc(NN[0][:-1, :, :])*np.nan].flatten()
            Y1 = np.c_[mkvc(NN[1][:-1, :, :]), mkvc(NN[1][1:, :, :]), mkvc(NN[1][:-1, :, :])*np.nan].flatten()
            Z1 = np.c_[mkvc(NN[2][:-1, :, :]), mkvc(NN[2][1:, :, :]), mkvc(NN[2][:-1, :, :])*np.nan].flatten()

            X2 = np.c_[mkvc(NN[0][:, :-1, :]), mkvc(NN[0][:, 1:, :]), mkvc(NN[0][:, :-1, :])*np.nan].flatten()
            Y2 = np.c_[mkvc(NN[1][:, :-1, :]), mkvc(NN[1][:, 1:, :]), mkvc(NN[1][:, :-1, :])*np.nan].flatten()
            Z2 = np.c_[mkvc(NN[2][:, :-1, :]), mkvc(NN[2][:, 1:, :]), mkvc(NN[2][:, :-1, :])*np.nan].flatten()

            X3 = np.c_[mkvc(NN[0][:, :, :-1]), mkvc(NN[0][:, :, 1:]), mkvc(NN[0][:, :, :-1])*np.nan].flatten()
            Y3 = np.c_[mkvc(NN[1][:, :, :-1]), mkvc(NN[1][:, :, 1:]), mkvc(NN[1][:, :, :-1])*np.nan].flatten()
            Z3 = np.c_[mkvc(NN[2][:, :, :-1]), mkvc(NN[2][:, :, 1:]), mkvc(NN[2][:, :, :-1])*np.nan].flatten()

            X = np.r_[X1, X2, X3]
            Y = np.r_[Y1, Y2, Y3]
            Z = np.r_[Z1, Z2, Z3]

            ax.plot(X, Y, 'b', zs=Z)
            ax.set_zlabel('x3')

        ax.grid(True)
        ax.set_xlabel('x1')
        ax.set_ylabel('x2')

        if showIt:
            plt.show()
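
plotGrid relies on a classic matplotlib idiom: interleaving np.nan between the endpoints of each grid segment lets a single plot() call draw many disconnected segments, because matplotlib lifts the pen wherever it meets a NaN. A minimal standalone illustration (not SimPEG code):

import numpy as np
import matplotlib.pyplot as plt

# two horizontal segments, separated by NaN so they are not joined
x = np.array([0, 1, np.nan, 0, 1])
y = np.array([0, 0, np.nan, 1, 1])
plt.plot(x, y, 'b-')
plt.show()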

Example 57

Project: bctpy Source File: distance.py
Function: charpath
def charpath(D, include_diagonal=False, include_infinite=True):
    '''
    The characteristic path length is the average shortest path length in
    the network. The global efficiency is the average inverse shortest path
    length in the network.

    Parameters
    ----------
    D : NxN np.ndarray
        distance matrix
    include_diagonal : bool
        If True, include the weights on the diagonal. Default value is False.
    include_infinite : bool
        If True, include infinite distances in calculation

    Returns
    -------
    lambda : float
        characteristic path length
    efficiency : float
        global efficiency
    ecc : Nx1 np.ndarray
        eccentricity at each vertex
    radius : float
        radius of graph
    diameter : float
        diameter of graph

    Notes
    -----
    The input distance matrix may be obtained with any of the distance
    functions, e.g. distance_bin, distance_wei.
    Characteristic path length is calculated as the global mean of
    the distance matrix D, excluding any 'Infs' but including distances on
    the main diagonal.
    '''
    D = D.copy()

    if not include_diagonal:
        np.fill_diagonal(D, np.nan)

    if not include_infinite:
        D[np.isinf(D)] = np.nan

    Dv = D[np.logical_not(np.isnan(D))].ravel()

    # mean of finite entries of D[G]
    lambda_ = np.mean(Dv)

    # efficiency: mean of inverse entries of D[G]
    efficiency = np.mean(1 / Dv)

    # eccentricity for each vertex (ignore inf)
    ecc = np.array(np.ma.masked_equal(D, np.nan).max(axis=1))

    # radius of graph
    radius = np.min(ecc)  # but what about zeros?

    # diameter of graph
    diameter = np.max(ecc)

    return lambda_, efficiency, ecc, radius, diameter
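
Here NaN acts as an "ignore this entry" marker: the diagonal and, optionally, infinite distances are overwritten with np.nan, and the statistics are then taken over the remaining entries only. A standalone sketch of that filtering on a tiny hand-made matrix (not bctpy code):

import numpy as np

D = np.array([[0., 1., 2.],
              [1., 0., np.inf],
              [2., np.inf, 0.]])
np.fill_diagonal(D, np.nan)        # drop self-distances
D[np.isinf(D)] = np.nan            # drop unreachable pairs
Dv = D[~np.isnan(D)]               # finite entries only
print(np.mean(Dv), np.mean(1 / Dv))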

Example 58

Project: pysystemtrade Source File: test_Accounting.py
    def test_get_trades_from_positions(self):
        positions = pd.DataFrame(
            [np.nan, 2, 3, np.nan, 2, 3, 3.1, 4, 3, 5, 7],
            dt_range2)

        price = pd.DataFrame(
            [100, 103, np.nan, 106, 110, 105, np.nan, 106, 120, np.nan, 142],
            dt_range2)

        # test delayed fill
        delayfill = True
        roundpositions = True
        get_daily_returns_volatility = None
        forecast = None
        fx = None
        value_of_price_point = None
        trades = get_trades_from_positions(price,
                                           positions,
                                           delayfill,
                                           roundpositions,
                                           get_daily_returns_volatility,
                                           forecast,
                                           fx,
                                           value_of_price_point)

        np.testing.assert_almost_equal(
            trades.trades,
            [2.0, 1.0, -1.0, 1.0, 1.0, -1.0, 2.0, 2.0])

        np.testing.assert_almost_equal(
            trades.fill_price[:-1],
            [106.0, 106.0, 105.0, 106.0, 120.0, 142.0, 142.0])

        # test none delayed fill
        delayfill = False
        trades = get_trades_from_positions(price,
                                           positions,
                                           delayfill,
                                           roundpositions,
                                           get_daily_returns_volatility,
                                           forecast,
                                           fx,
                                           value_of_price_point)

        np.testing.assert_almost_equal(
            trades.trades,
            [2.0, 1.0, -1.0, 1.0, 1.0, -1.0, 2.0, 2.0])

        np.testing.assert_almost_equal(
            trades.fill_price,
            [103.0, 106.0, 110.0, 105.0, 106.0, 120.0, 142.0, 142.0])

        # test roundpositions
        delayfill = True
        roundpositions = False
        trades = get_trades_from_positions(price,
                                           positions,
                                           delayfill,
                                           roundpositions,
                                           get_daily_returns_volatility,
                                           forecast,
                                           fx,
                                           value_of_price_point)

        np.testing.assert_almost_equal(
            trades.trades,
            [2.0, 1.0, -1.0, 1.0, 0.1, 0.9, -1.0, 2.0, 2.0])

        np.testing.assert_almost_equal(
            trades.fill_price[:-1],
            [106.0, 106.0, 105.0, 106.0, 106.0, 120.0, 142.0, 142.0])

Example 59

Project: trtools Source File: test_wrangling.py
Function: test_pairwise
    def test_pairwise(self):
        df = pd.DataFrame(index=list(range(10)))
        for x in range(3):
            df[x] = list(range(x, x+10))

        nandf = df.copy().astype(float)
        nandf.ix[9:,1] = np.nan

        # test with order=True
        # test with permutations
        pairs = pairwise(df, lambda x, y: x.sum() - y.sum())
        expected = pd.DataFrame([[0, -10, -20], 
                                 [10, 0, -10],
                                 [20, 10, 0]], index=list(range(3)), dtype=float)
        tm.assert_frame_equal(pairs, expected)

        # test with combinations
        pairs = pairwise(df, lambda x, y: x.sum() - y.sum(), order=False)
        expected = pd.DataFrame([[0, -10, -20], 
                                 [-10, 0, -10],
                                 [-20, -10, 0]], index=list(range(3)), dtype=float)
        tm.assert_frame_equal(pairs, expected)

        # test with combinations and values
        # use nandf to test. np.ndarray.sum() returns NaN if it contains nan
        pairs = pairwise(nandf, lambda x, y: x.sum() - y.sum(), order=True, 
                         force_values=True)
        expected = pd.DataFrame([[0, np.nan, -20], 
                                 [np.nan, np.nan, np.nan],
                                 [20, np.nan, 0]], index=list(range(3)), dtype=float)
        tm.assert_frame_equal(pairs, expected)

        # test with np.nansum.
        pairs = pairwise(nandf, lambda x, y: np.nansum(x) - np.nansum(y), 
                         order=True, force_values=True)
        expected = pd.DataFrame([[0, 0, -20], 
                                 [0, 0, -20],
                                 [20, 20, 0]], index=list(range(3)), dtype=float)
        tm.assert_frame_equal(pairs, expected)

        # the np.nansum version should be same as Series.sum version
        pairs_series = pairwise(nandf, lambda x, y: x.sum() - y.sum(), 
                         order=True, force_values=False)
        tm.assert_frame_equal(pairs, pairs_series)
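
The comment in the test is the key point: a raw ndarray sum propagates NaN, while pandas' Series.sum and np.nansum skip it, which is why force_values=True changes the result. A one-screen standalone check:

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, np.nan])
print(s.sum())                # 3.0  -- pandas skips NaN by default
print(s.values.sum())         # nan  -- ndarray.sum() propagates NaN
print(np.nansum(s.values))    # 3.0  -- nansum ignores NaN again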

Example 60

Project: scikit-image Source File: _geometric.py
Function: estimate
    def estimate(self, src, dst):
        """Estimate the transformation from a set of corresponding points.

        You can determine the over-, well- and under-determined parameters
        with the total least-squares method.

        Number of source and destination coordinates must match.

        The transformation is defined as::

            X = (a0*x + a1*y + a2) / (c0*x + c1*y + 1)
            Y = (b0*x + b1*y + b2) / (c0*x + c1*y + 1)

        These equations can be transformed to the following form::

            0 = a0*x + a1*y + a2 - c0*x*X - c1*y*X - X
            0 = b0*x + b1*y + b2 - c0*x*Y - c1*y*Y - Y

        which exist for each set of corresponding points, so we have a set of
        N * 2 equations. The coefficients appear linearly so we can write
        A x = 0, where::

            A   = [[x y 1 0 0 0 -x*X -y*X -X]
                   [0 0 0 x y 1 -x*Y -y*Y -Y]
                    ...
                    ...
                  ]
            x.T = [a0 a1 a2 b0 b1 b2 c0 c1 c3]

        In case of total least-squares the solution of this homogeneous system
        of equations is the right singular vector of A which corresponds to the
        smallest singular value normed by the coefficient c3.

        In case of the affine transformation the coefficients c0 and c1 are 0.
        Thus the system of equations is::

            A   = [[x y 1 0 0 0 -X]
                   [0 0 0 x y 1 -Y]
                    ...
                    ...
                  ]
            x.T = [a0 a1 a2 b0 b1 b2 c3]

        Parameters
        ----------
        src : (N, 2) array
            Source coordinates.
        dst : (N, 2) array
            Destination coordinates.

        Returns
        -------
        success : bool
            True, if model estimation succeeds.

        """

        try:
            src_matrix, src = _center_and_normalize_points(src)
            dst_matrix, dst = _center_and_normalize_points(dst)
        except ZeroDivisionError:
            self.params = np.nan * np.empty((3, 3))
            return False

        xs = src[:, 0]
        ys = src[:, 1]
        xd = dst[:, 0]
        yd = dst[:, 1]
        rows = src.shape[0]

        # params: a0, a1, a2, b0, b1, b2, c0, c1
        A = np.zeros((rows * 2, 9))
        A[:rows, 0] = xs
        A[:rows, 1] = ys
        A[:rows, 2] = 1
        A[:rows, 6] = - xd * xs
        A[:rows, 7] = - xd * ys
        A[rows:, 3] = xs
        A[rows:, 4] = ys
        A[rows:, 5] = 1
        A[rows:, 6] = - yd * xs
        A[rows:, 7] = - yd * ys
        A[:rows, 8] = xd
        A[rows:, 8] = yd

        # Select relevant columns, depending on params
        A = A[:, list(self._coeffs) + [8]]

        _, _, V = np.linalg.svd(A)

        H = np.zeros((3, 3))
        # solution is right singular vector that corresponds to smallest
        # singular value
        H.flat[list(self._coeffs) + [8]] = - V[-1, :-1] / V[-1, -1]
        H[2, 2] = 1

        # De-center and de-normalize
        H = np.dot(np.linalg.inv(dst_matrix), np.dot(H, src_matrix))

        self.params = H

        return True
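
On failure the estimator stores a NaN-filled parameter matrix instead of raising, so callers can test np.isnan(self.params). Multiplying np.nan by any float array yields all NaN, which makes np.nan * np.empty((3, 3)) a cheap sentinel; np.full((3, 3), np.nan) is the more explicit equivalent. A standalone check (not scikit-image code):

import numpy as np

params = np.nan * np.empty((3, 3))   # NaN times anything (even inf) is NaN
print(np.isnan(params).all())        # True
params2 = np.full((3, 3), np.nan)    # the more explicit spelling
print(np.isnan(params2).all())       # True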

Example 61

Project: bokeh Source File: test_comp_glyphs.py
def test_xyglyph_xy_range():
    def check_bounds(xyg, xmin=0, xmax=4, ymin=1, ymax=5):
        assert xyg.x_min == xmin
        assert xyg.x_max == xmax
        assert xyg.y_min == ymin
        assert xyg.y_max == ymax

    for Glyph in [LineGlyph, PointGlyph]:
        x = pd.Series([0, 1, 2, 3, 4])
        y = pd.Series([5, 4, 3, 2, 1])
        xyg = Glyph(x=x, y=y)
        check_bounds(xyg)

        x[1] = x[2] = np.nan
        xyg = Glyph(x=x, y=y)
        check_bounds(xyg)

        x[0] = np.nan
        xyg = Glyph(x=x, y=y)
        check_bounds(xyg, xmin=3)

        x[4] = np.nan
        xyg = Glyph(x=x, y=y)
        check_bounds(xyg, xmin=3, xmax=3)

        y[1] = y[2] = np.nan
        xyg = Glyph(x=x, y=y)
        check_bounds(xyg, xmin=3, xmax=3)

        y[0] = np.nan
        xyg = Glyph(x=x, y=y)
        check_bounds(xyg, xmin=3, xmax=3, ymax=2)

        y[4] = np.nan
        xyg = Glyph(x=x, y=y)
        check_bounds(xyg, xmin=3, xmax=3, ymax=2, ymin=2)
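
The bounds checks only work because pandas reductions skip NaN by default, so progressively NaN-ing out points shrinks the min/max range rather than turning it into NaN. A quick standalone confirmation:

import numpy as np
import pandas as pd

x = pd.Series([0., 1., 2., 3., 4.])
x[1] = x[2] = np.nan
print(x.min(), x.max())       # 0.0 4.0 -- NaN entries are skipped
x[0] = x[4] = np.nan
print(x.min(), x.max())       # 3.0 3.0 -- only one valid point left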

Example 62

Project: scikit-bio Source File: distance.py
@experimental(as_of='0.4.2')
def hamming(seq1, seq2):
    """Compute Hamming distance between two sequences.

    The Hamming distance between two equal-length sequences is the proportion
    of differing characters.

    Parameters
    ----------
    seq1, seq2 : Sequence
        Sequences to compute Hamming distance between.

    Returns
    -------
    float
        Hamming distance between `seq1` and `seq2`.

    Raises
    ------
    TypeError
        If `seq1` and `seq2` are not ``Sequence`` instances.
    TypeError
        If `seq1` and `seq2` are not the same type.
    ValueError
        If `seq1` and `seq2` are not the same length.

    See Also
    --------
    scipy.spatial.distance.hamming

    Notes
    -----
    ``np.nan`` will be returned if the sequences do not contain any characters.

    This function does not make assumptions about the sequence alphabet in use.
    Each sequence object's underlying sequence of characters is used to
    compute Hamming distance. Characters that may be considered equivalent in
    certain contexts (e.g., `-` and `.` as gap characters) are treated as
    distinct characters when computing Hamming distance.

    Examples
    --------
    >>> from skbio import Sequence
    >>> from skbio.sequence.distance import hamming
    >>> seq1 = Sequence('AGGGTA')
    >>> seq2 = Sequence('CGTTTA')
    >>> hamming(seq1, seq2)
    0.5

    """
    _check_seqs(seq1, seq2)

    # Hamming requires equal length sequences. We are checking this here
    # because the error you would get otherwise is cryptic.
    if len(seq1) != len(seq2):
        raise ValueError(
            "Hamming distance can only be computed between sequences of equal "
            "length (%d != %d)" % (len(seq1), len(seq2)))

    # scipy throws a RuntimeWarning when computing Hamming distance on length 0
    # input.
    if not seq1:
        distance = np.nan
    else:
        distance = scipy.spatial.distance.hamming(seq1.values, seq2.values)

    return float(distance)
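
np.nan is used here as the "result is undefined" value: the proportion of differing characters has no meaning for empty sequences. A hypothetical plain-numpy helper with the same convention (proportion_differing is made up for illustration and is not skbio's implementation):

import numpy as np

def proportion_differing(a, b):
    """Fraction of positions where two equal-length strings differ;
    np.nan when the strings are empty and the proportion is undefined."""
    if len(a) != len(b):
        raise ValueError("inputs must be the same length")
    if len(a) == 0:
        return float(np.nan)
    return float(np.mean(np.asarray(list(a)) != np.asarray(list(b))))

print(proportion_differing('AGGGTA', 'CGTTTA'))   # 0.5
print(proportion_differing('', ''))               # nan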

Example 63

Project: sfepy Source File: plot_times.py
def main():
    parser = ArgumentParser(description=__doc__)
    parser.add_argument('--version', action='version', version='%(prog)s')
    parser.add_argument('-l', '--logarithmic',
                        action='store_true', dest='logarithmic',
                        default=False, help=helps['logarithmic'])
    parser.add_argument('filename')
    options = parser.parse_args()

    filename = options.filename

    plt.rcParams['lines.linewidth'] = 3
    plt.rcParams['lines.markersize'] = 9
    fontsize = 16

    steps, times, nts, dts = extract_times(filename)
    dts[-1] = nm.nan

    ax = plt.subplot(211)
    if options.logarithmic:
        l1, = ax.semilogy(steps, dts, 'b')
    else:
        l1, = ax.plot(steps, dts, 'b')
    ax.set_xlabel('step', fontsize=fontsize)
    ax.set_ylabel(r'$\Delta t$', fontsize=fontsize)
    ax.grid(True)

    ax = ax.twinx()
    l2, = ax.plot(steps, times, 'g')
    ax.set_ylabel(r'$t$', fontsize=fontsize)
    ax.legend([l1, l2], [r'$\Delta t$', r'$t$'], loc=0)

    ax = plt.subplot(212)
    if options.logarithmic:
        ax.semilogy(times, dts, 'b+')
    else:
        ax.plot(times, dts, 'b+')
    ax.set_xlabel(r'$t$', fontsize=fontsize)
    ax.set_ylabel(r'$\Delta t$', fontsize=fontsize)
    ax.grid(True)

    plt.show()

Example 64

Project: lifetimes Source File: utils.py
def summary_data_from_transaction_data(transactions, customer_id_col, datetime_col, monetary_value_col=None, datetime_format=None,
                                       observation_period_end=datetime.today(), freq='D'):
    """
    This transforms a Dataframe of transaction data of the form:
        customer_id, datetime [, monetary_value]
    to a Dataframe of the form:
        customer_id, frequency, recency, T [, monetary_value]
    Parameters:
        transactions: a Pandas DataFrame.
        customer_id_col: the column in transactions that denotes the customer_id
        datetime_col: the column in transactions that denotes the datetime the purchase was made.
        monetary_value_col: the columns in the transactions that denotes the monetary value of the transaction.
            Optional, only needed for customer lifetime value estimation models.
        observation_period_end: a string or datetime to denote the final date of the study. Events
            after this date are truncated.
        datetime_format: a string that represents the timestamp format. Useful if Pandas can't understand
            the provided format.
        freq: Default 'D' for days, 'W' for weeks, 'M' for months... etc. Full list here:
            http://pandas.pydata.org/pandas-docs/stable/timeseries.html#dateoffset-objects
    """
    observation_period_end = pd.to_datetime(observation_period_end, format=datetime_format).to_period(freq)

    # label all of the repeated transactions
    repeated_transactions = find_first_transactions(
        transactions,
        customer_id_col,
        datetime_col,
        monetary_value_col,
        datetime_format,
        observation_period_end,
        freq
    )
    # count all orders by customer.
    customers = repeated_transactions.groupby(customer_id_col, sort=False)[datetime_col].agg(['min', 'max', 'count'])

    # subtract 1 from count, as we ignore their first order.
    customers['frequency'] = customers['count'] - 1

    customers['T'] = (observation_period_end - customers['min'])
    customers['recency'] = (customers['max'] - customers['min'])

    summary_columns = ['frequency', 'recency', 'T']

    if monetary_value_col:
        # create an index of all the first purchases
        first_purchases = repeated_transactions[repeated_transactions['first']].index
        # by setting the monetary_value cells of all the first purchases to NaN,
        # those values will be excluded from the mean value calculation
        repeated_transactions.loc[first_purchases, monetary_value_col] = np.nan

        customers['monetary_value'] = repeated_transactions.groupby(customer_id_col)[monetary_value_col].mean().fillna(0)
        summary_columns.append('monetary_value')

    return customers[summary_columns].astype(float)
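
Setting the monetary value of each first purchase to np.nan is what keeps it out of the groupby mean, since pandas' mean skips NaN; the trailing fillna(0) then covers customers with no repeat purchases at all. A minimal sketch with a toy transactions frame (the column names here are made up for illustration):

import numpy as np
import pandas as pd

tx = pd.DataFrame({'customer': [1, 1, 1, 2, 2],
                   'value':    [10., 20., 30., 5., 7.],
                   'first':    [True, False, False, True, False]})
tx.loc[tx['first'], 'value'] = np.nan           # exclude first purchases
print(tx.groupby('customer')['value'].mean().fillna(0))
# customer 1 -> 25.0 (mean of 20 and 30), customer 2 -> 7.0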

Example 65

Project: zipline Source File: requests_csv.py
    def load_df(self):
        df = self.fetch_data()

        if self.pre_func:
            df = self.pre_func(df)

        # Batch-convert the user-specified date column into timestamps.
        df['dt'] = self.parse_date_str_series(
            self.date_format,
            self.timezone,
            df[self.date_column],
            self.data_frequency,
            self.trading_day,
        ).values

        # ignore rows whose dates we couldn't parse
        df = df[df['dt'].notnull()]

        if self.symbol is not None:
            df['sid'] = self.symbol
        elif self.finder:

            df.sort_values(by=self.symbol_column, inplace=True)

            # Pop the 'sid' column off of the DataFrame, just in case the user
            # has assigned it, and throw a warning
            try:
                df.pop('sid')
                warnings.warn(
                    "Assignment of the 'sid' column of a DataFrame is "
                    "not supported by Fetcher. The 'sid' column has been "
                    "overwritten.",
                    category=UserWarning,
                    stacklevel=2,
                )
            except KeyError:
                # There was no 'sid' column, so no warning is necessary
                pass

            # Fill entries for any symbols that don't require a date to
            # uniquely identify.  Entries for which multiple securities exist
            # are replaced with zeroes, while entries for which no asset
            # exists are replaced with NaNs.
            unique_symbols = df[self.symbol_column].unique()
            sid_series = pd.Series(
                data=map(self._lookup_unconflicted_symbol, unique_symbols),
                index=unique_symbols,
                name='sid',
            )
            df = df.join(sid_series, on=self.symbol_column)

            # Fill any zero entries left in our sid column by doing a lookup
            # using both symbol and the row date.
            conflict_rows = df[df['sid'] == 0]
            for row_idx, row in conflict_rows.iterrows():
                try:
                    asset = self.finder.lookup_symbol(
                        row[self.symbol_column],
                        # Replacing tzinfo here is necessary because of the
                        # timezone metadata bug described below.
                        row['dt'].replace(tzinfo=pytz.utc),

                        # It's possible that no asset comes back here if our
                        # lookup date is from before any asset held the
                        # requested symbol.  Mark such cases as NaN so that
                        # they get dropped in the next step.
                    ) or numpy.nan
                except SymbolNotFound:
                    asset = numpy.nan

                # Assign the resolved asset to the cell
                df.ix[row_idx, 'sid'] = asset

            # Filter out rows containing symbols that we failed to find.
            length_before_drop = len(df)
            df = df[df['sid'].notnull()]
            no_sid_count = length_before_drop - len(df)
            if no_sid_count:
                logger.warn(
                    "Dropped {} rows from fetched csv.".format(no_sid_count),
                    no_sid_count,
                    extra={'syslog': True},
                )
        else:
            df['sid'] = df['symbol']

        # Dates are localized to UTC when they come out of
        # parse_date_str_series, but we need to re-localize them here because
        # of a bug that wasn't fixed until
        # https://github.com/pydata/pandas/pull/7092.
        # We should be able to remove the call to tz_localize once we're on
        # pandas 0.14.0

        # We don't set 'dt' as the index until here because the Symbol parsing
        # operations above depend on having a unique index for the dataframe,
        # and the 'dt' column can contain multiple dates for the same entry.
        df.drop_duplicates(["sid", "dt"])
        df.set_index(['dt'], inplace=True)
        df = df.tz_localize('UTC')
        df.sort_index(inplace=True)

        cols_to_drop = [self.date_column]
        if self.symbol is None:
            cols_to_drop.append(self.symbol_column)
        df = df[df.columns.drop(cols_to_drop)]

        if self.post_func:
            df = self.post_func(df)

        return df
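
Fetcher uses numpy.nan as the "symbol could not be resolved" marker and then drops those rows with notnull(). The same idiom in miniature, with a hypothetical dictionary standing in for the asset finder (this is not zipline code):

import numpy as np
import pandas as pd

known_sids = {'AAPL': 24, 'MSFT': 5140}         # stand-in for the asset finder
df = pd.DataFrame({'symbol': ['AAPL', 'XXXX', 'MSFT']})
df['sid'] = df['symbol'].map(lambda s: known_sids.get(s, np.nan))
df = df[df['sid'].notnull()]                    # drop unresolved symbols
print(df)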

Example 66

Project: yatsm Source File: masking.py
def smooth_mask(x, Y, span, crit=400, green=1, swir1=4,
                maxiter=5):
    """ Multi-temporal masking using LOWESS

    Taken directly from newer version of CCDC than Zhu and Woodcock, 2014.
    This "temporal masking" replaced the older method which used robust
    linear models. This version uses a regular LOWESS instead of robust
    LOWESS

    .. note::

        "span" argument is the inverse of "frac" from statsmodels and is
        actually 'k' in their code:

        `n = x.shape[0]`
        `k = int(frac * n + 1e-10)`

    .. todo::

        We need to put the data on a regular period since span changes as
        is right now. Statsmodels will only allow for dropna, so we would
        need to impute missing data somehow...

    Args:
        x (np.ndarray): array of ordinal dates
        Y (np.ndarray): matrix of observed spectra
        span (int): span of LOWESS
        crit (float): critical value for masking clouds/shadows
        green (int): 0 indexed value for green band in Y (default: 1)
        swir1 (int): 0 indexed value for SWIR (~1.55-1.75um) band
            in Y (default: 4)
        maxiter (int): maximum increases to span when checking for
            NaN in LOWESS results

    Returns:
      mask (ndarray): mask where False indicates values to be masked

    """
    # Reverse span to get frac
    frac = span / x.shape[0]
    # Estimate delta as "good choice": delta = 0.01 * range(exog)
    delta = (x.max() - x.min()) * 0.01

    # Run LOWESS checking for NaN in output
    i = 0
    green_lowess, swir1_lowess = np.nan, np.nan
    while (np.any(np.isnan(green_lowess)) or
           np.any(np.isnan(swir1_lowess))) and i < maxiter:
        green_lowess = sm.nonparametric.lowess(Y[green, :], x,
                                               frac=frac, delta=delta)
        swir1_lowess = sm.nonparametric.lowess(Y[swir1, :], x,
                                               frac=frac, delta=delta)
        span += 1
        frac = span / x.shape[0]
        i += 1

    mask = (((Y[green, :] - green_lowess[:, 1]) < crit) *
            ((Y[swir1, :] - swir1_lowess[:, 1]) > -crit))

    return mask

Example 67

Project: trackpy Source File: test_perf.py
def profile_head_single(benchmark):
    import gc
    results = []

    # just in case
    gc.collect()

    try:
        from ctypes import cdll, CDLL
        cdll.LoadLibrary("libc.so.6")
        libc = CDLL("libc.so.6")
        libc.malloc_trim(0)
    except:
        pass


    N =  args.hrepeats + args.burnin

    results = []
    try:
        for i in range(N):
            gc.disable()
            d=dict()

            try:
                d = benchmark.run()

            except KeyboardInterrupt:
                raise
            except Exception as e: # if a single vbench bursts into flames, don't die.
                err=""
                try:
                    err =  d.get("traceback")
                    if err is None:
                        err = str(e)
                except:
                    pass
                print("%s died with:\n%s\nSkipping...\n" % (benchmark.name, err))

            results.append(d.get('timing',np.nan))
            gc.enable()
            gc.collect()

    finally:
        gc.enable()

    if results:
        # throw away the burn_in
        results = results[args.burnin:]
    sys.stdout.write('.')
    sys.stdout.flush()
    return Series(results, name=benchmark.name)

Example 68

Project: findatapy Source File: calculations.py
    def calculate_individual_trade_gains(self, signal_data_frame, strategy_returns_data_frame):
        """
        calculate_individual_trade_gains - Calculates profits on every trade

        Parameters
        ----------
        signal_data_frame : DataFrame
            trading signals
        strategy_returns_data_frame: DataFrame
            returns of strategy to be tested
        period_shift : int
            number of periods to shift signal

        Returns
        -------
        DataFrame
        """

        # signal need to be aligned to NEXT period for returns
        signal_data_frame_pushed = signal_data_frame.shift(1)

        # find all the trade points
        trade_points = ((signal_data_frame - signal_data_frame.shift(1)).abs())
        cumulative = self.create_mult_index(strategy_returns_data_frame)

        indices = trade_points > 0
        indices.columns = cumulative.columns

        # get P&L for every trade (from the end point - start point)
        trade_returns = numpy.nan * cumulative
        trade_points_cumulative = cumulative[indices]

        # for each set of signals/returns, calculate the trade returns - where there isn't a trade
        # assign a NaN
        # TODO do in one vectorised step without for loop
        for col_name in trade_points_cumulative:
            col = trade_points_cumulative[col_name]
            col = col.dropna()
            col = col / col.shift(1) - 1

            # TODO experiment with quicker ways of writing below?
            # for val in col.index:
                # trade_returns.set_value(val, col_name, col[val])
                # trade_returns.ix[val, col_name] = col[val]

            date_indices = trade_returns.index.searchsorted(col.index)
            trade_returns.ix[date_indices, col_name] = col

        return trade_returns
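
numpy.nan * cumulative is a quick way to get a DataFrame with the same index and columns as cumulative but with every value blanked out, ready to be sparsely filled with per-trade returns. A standalone illustration on a hypothetical single-column frame (not findatapy code):

import numpy as np
import pandas as pd

cumulative = pd.DataFrame({'strat': [1.00, 1.01, 1.03, 1.02]})
trade_returns = np.nan * cumulative   # same shape and labels, all values NaN
trade_returns.iloc[2, 0] = 0.0198     # write a return only where a trade closed
print(trade_returns)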

Example 69

Project: empyrical Source File: stats.py
def alpha_aligned(returns, factor_returns, risk_free=0.0, period=DAILY,
                  annualization=None, _beta=None):
    """Calculates annualized alpha.

    If they are pd.Series, expects returns and factor_returns have already
    been aligned on their labels.  If np.ndarray, these arguments should have
    the same shape.

    Parameters
    ----------
    returns : pd.Series or np.ndarray
        Daily returns of the strategy, noncumulative.
        - See full explanation in :func:`~empyrical.stats.cum_returns`.
    factor_returns : pd.Series or np.ndarray
         Daily noncumulative returns of the factor to which beta is
         computed. Usually a benchmark such as the market.
         - This is in the same style as returns.
    risk_free : int, float, optional
        Constant risk-free return throughout the period. For example, the
        interest rate on a three month us treasury bill.
    period : str, optional
        Defines the periodicity of the 'returns' data for purposes of
        annualizing. Value ignored if `annualization` parameter is specified.
        Defaults are:
            'monthly':12
            'weekly': 52
            'daily': 252
    annualization : int, optional
        Used to suppress default values available in `period` to convert
        returns into annual returns. Value should be the annual frequency of
        `returns`.
        - See full explanation in :func:`~empyrical.stats.annual_return`.
    _beta : float, optional
        The beta for the given inputs, if already known. Will be calculated
        internally if not provided.

    Returns
    -------
    float
        Alpha.
    """
    if len(returns) < 2:
        return np.nan

    ann_factor = annualization_factor(period, annualization)

    if _beta is None:
        _beta = beta_aligned(returns, factor_returns, risk_free)

    adj_returns = _adjust_returns(returns, risk_free)
    adj_factor_returns = _adjust_returns(factor_returns, risk_free)
    alpha_series = adj_returns - (_beta * adj_factor_returns)

    return nanmean(alpha_series) * ann_factor
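
Returning np.nan when there are fewer than two observations is the usual "statistic is undefined for this input" convention, and the NaN propagates cleanly through any downstream arithmetic. A tiny sketch of the same guard around a made-up statistic (not empyrical's alpha):

import numpy as np

def mean_return_or_nan(returns):
    """np.nan stands for 'not enough data to compute the statistic'."""
    if len(returns) < 2:
        return np.nan
    return float(np.nanmean(returns))

print(mean_return_or_nan([]))            # nan
print(mean_return_or_nan([1.0, 3.0]))    # 2.0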

Example 70

Project: refinery Source File: SuffStatBag.py
  def mergeComps(self, kA, kB):
    ''' Merge components kA, kB into a single component
    '''
    if self.K <= 1:
      raise ValueError('Must have at least 2 components to merge.')
    if kB == kA:
      raise ValueError('Distinct component ids required.')
    for key, dims in self._Fields._FieldDims.items():
      if dims is not None and dims != ():
        arr = getattr(self._Fields, key)
        if self.hasMergeTerm(key) and dims == ('K'):
          # some special terms need to be precomputed, like sumLogPiActive
          arr[kA] = getattr(self._MergeTerms, key)[kA,kB]
        else:
          # applies to vast majority of all fields
          arr[kA] += arr[kB]

    if self.hasELBOTerms():
      for key, dims in self._ELBOTerms._FieldDims.items():
        if self.hasMergeTerm(key) and dims == ('K'):
          arr = getattr(self._ELBOTerms, key)
          mArr = getattr(self._MergeTerms, key)
          arr[kA] = mArr[kA,kB]

    if self.hasMergeTerms():
      for key, dims in self._MergeTerms._FieldDims.items():
        if dims == ('K','K'):
          mArr = getattr(self._MergeTerms, key)
          mArr[kA,kA+1:] = np.nan
          mArr[:kA,kA] = np.nan

    self._Fields.removeComp(kB)
    if self.hasELBOTerms():
      self._ELBOTerms.removeComp(kB)
    if self.hasMergeTerms():
      self._MergeTerms.removeComp(kB)
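
After merging component kA with kB, every cached pairwise merge term involving kA is stale, so its row tail and column head are overwritten with np.nan to mark them as needing recomputation. The same indexing in miniature on a dummy K-by-K array (not refinery code):

import numpy as np

K, kA = 4, 1
merge_terms = np.arange(K * K, dtype=float).reshape(K, K)
merge_terms[kA, kA + 1:] = np.nan    # invalidate pairs (kA, kA+1 .. K-1)
merge_terms[:kA, kA] = np.nan        # invalidate pairs (0 .. kA-1, kA)
print(merge_terms)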

Example 71

Project: pyNastran Source File: elements.py
    def get_mass_by_element_id(self, element_ids_orig=None, xyz_cid0=None, sort_output=True):
        if xyz_cid0 is None:
            xyz_cid0 = self.model.grid.get_position_by_node_index()

        element_ids, element_ids_orig = self._get_element_ids(element_ids_orig)
        if len(element_ids) == 0:
            nelements = len(element_ids_orig)
            mass = np.full(nelements, np.nan, 'float64')
            if sort_output:
                i = np.argsort(element_ids_orig)
                #print("i =", i, i.shape)
                #print("element_ids_orig =", element_ids_orig, element_ids_orig.shape)
                return element_ids_orig[i], mass
            else:
                return element_ids_orig, mass
        nelements_orig = len(element_ids_orig)
        #print('eids orig = %s' % element_ids_orig)

        type_map = {
            #'CELAS1' : self.elements_spring.celas1,
            'CELAS2' : self.elements_spring.celas2,
            'CELAS3' : self.elements_spring.celas3,
            'CELAS4' : self.elements_spring.celas4,

            'CBAR'  : self.cbar,
            'CBEAM' : self.cbeam,

            'CROD' : self.crod,
            'CONROD' : self.conrod,
            'CTUBE' : self.ctube,
            'CSHEAR' : self.cshear,

            'CQUAD4' : self.elements_shell.cquad4,
            'CTRIA3' : self.elements_shell.ctria3,

            'CTETRA4' : self.elements_solid.ctetra4,
            'CPENTA6' : self.elements_solid.cpenta6,
            'CHEXA8' : self.elements_solid.chexa8,

            'CTETRA10' : self.elements_solid.ctetra10,
            'CPENTA15' : self.elements_solid.cpenta15,
            'CHEXA20' : self.elements_solid.chexa20,
        }

        exclude_types = [
            'CELAS1', 'CELAS2', 'CELAS3', 'CELAS4',
            'CDAMP1', 'CDAMP2', 'CDAMP3', 'CDAMP4',
            'CMASS1', 'CMASS2', 'CMASS3', 'CMASS4',
            'CBUSH', 'CBUSH1D', 'CBUSH2D',
            ]
        pid_data = self.get_element_ids_by_property_type(element_ids, exclude_types=exclude_types)
        element_ids_to_analyze = pid_data[:, 0]
        data2 = self.group_elements_by_property_type_and_element_type(self, pid_data)

        #self.model.log.debug('**data2 = %s' % data2)
        nelements = len(element_ids)
        #self.model.log.debug('nelement_ids =', nelements)
        eids2 = np.zeros(nelements_orig, dtype='int32')
        #mass = full(nelements, nan, dtype='float64')
        mass = np.full(nelements_orig, np.nan, dtype='float64')
        #self.model.log.debug('mass.shape =', mass.shape)

        ni = 0
        self.model.log.debug('data2 = %s' % data2)
        for (pid, etype), element_ids in iteritems(data2):
            #self.model.log.debug('pid=%s eType=%s element_ids=%s' % (pid, eType, element_ids))
            elements = type_map[etype]
            i = np.searchsorted(elements.element_id, element_ids)
            n = len(i)
            eids2[ni:ni+n] = elements.element_id[i]
            if pid == 0:
                # CONROD
                pass
            else:
                self.model.log.debug('*cat pid = %s' % pid)
                props = self.get_properties([pid])
                if len(props) == 0:
                    # the property doesn't exist
                    self.model.log.debug('Property %i does not exist and is needed by %s eid=%i' % (
                        pid, etype, element_ids[0]))
                    ni += n
                    #print('props = %s' % props)
                    continue

                # we only get one property at a time
                prop = props[0]
                #print('  prop = %s' % str(prop).rstrip())

            elements = type_map[etype]
            i = np.searchsorted(elements.element_id, element_ids)
            n = len(i)
            #print('ielements = %s' % i)

            if etype in ['CELAS1', 'CELAS2', 'CELAS3', 'CELAS4',]:
                pass
            elif etype in ['CROD', 'CONROD', 'CBAR', 'CBEAM']:
                msg = 'which is required for %ss' % etype
                n1 = self.get_nodes(elements.node_ids[i, 0], xyz_cid0, msg=msg)
                n2 = self.get_nodes(elements.node_ids[i, 1], xyz_cid0, msg=msg)
                length = np.linalg.norm(n2 - n1, axis=1)
                #print('prop = %s' % prop)
                #print('  calling get_mass_per_area for pid=%s' % (pid))
                if etype == 'CONROD':
                    rho = self.model.materials.get_density_by_material_id(elements.material_id)
                    mass[ni:ni+n] = length * elements.A[i] * rho[i]  + elements.nsm[i]
                else:
                    mpl = prop.get_mass_per_length_by_property_id()
                    mass[ni:ni+n] = mpl * length
                    del prop
            elif etype in ['CTRIA3', 'CQUAD4', 'CSHEAR']:
                if etype == 'CTRIA3':
                    n1, n2, n3 = (elements.node_ids[i, 0], elements.node_ids[i, 1],
                                  elements.node_ids[i, 2])
                    n1 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n1), :]
                    n2 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n2), :]
                    n3 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n3), :]
                    area = _ctria3_normal_A(n1, n2, n3,
                                            calculate_area=True, normalize=False)[1]
                elif etype in ['CQUAD4', 'CSHEAR']:
                    n1, n2, n3, n4 = (elements.node_ids[i, 0], elements.node_ids[i, 1],
                                      elements.node_ids[i, 2], elements.node_ids[i, 3])
                    n1 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n1), :]
                    n2 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n2), :]
                    n3 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n3), :]
                    n4 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n4), :]
                    area = _cquad4_normal_A(n1, n2, n3, n4,
                                            calculate_area=True, normalize=False)[1]
                else:
                    self.model.log.debug("Element.get_mass doesn't support %s; try %s.get_mass" % (
                        etype, etype))
                    ni += n
                    continue
                #print('prop = %s' % prop)
                #print('  calling get_mass_per_area for pid=%s' % (pid))
                mpa = prop.get_mass_per_area_by_property_id()
                mass[ni:ni+n] = mpa * area
                del prop
            elif etype in ['CTETRA4', 'CTETRA10', 'CPENTA6', 'CPENTA15', 'CHEXA8', 'CHEXA20']:
                rho = prop.get_density_by_property_id()
                del prop
                if etype in ['CTETRA4', 'CTETRA10']:
                    n1, n2, n3, n4 = (elements.node_ids[i, 0], elements.node_ids[i, 1],
                                      elements.node_ids[i, 2], elements.node_ids[i, 3])
                    n1 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n1), :]
                    n2 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n2), :]
                    n3 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n3), :]
                    n4 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n4), :]

                    volumei = np.zeros(n, self.model.float_fmt)
                    i = 0
                    for n1i, n2i, n3i, n4i in zip(n1, n2, n3, n4):
                        volumei[i] = volume4(n1i, n2i, n3i, n4i)
                        i += 1
                elif etype in ['CPENTA6', 'CPENTA15']:
                    n1, n2, n3, n4, n5, n6 = (elements.node_ids[i, 0], elements.node_ids[i, 1],
                                              elements.node_ids[i, 2], elements.node_ids[i, 3],
                                              elements.node_ids[i, 4], elements.node_ids[i, 5], )
                    n1 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n1), :]
                    n2 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n2), :]
                    n3 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n3), :]
                    n4 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n4), :]
                    n5 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n5), :]
                    n6 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n6), :]
                    (area1, centroid1) = tri_area_centroid(n1, n2, n3)
                    (area2, centroid2) = tri_area_centroid(n4, n5, n6)
                    volumei = (area1 + area2) / 2. * np.linalg.norm(centroid1 - centroid2, axis=1)
                elif etype in ['CHEXA8', 'CHEXA20']:
                    n1, n2, n3, n4, n5, n6, n7, n8 = (
                        elements.node_ids[i, 0], elements.node_ids[i, 1],
                        elements.node_ids[i, 2], elements.node_ids[i, 3],
                        elements.node_ids[i, 4], elements.node_ids[i, 5],
                        elements.node_ids[i, 6], elements.node_ids[i, 7], )
                    n1 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n1), :]
                    n2 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n2), :]
                    n3 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n3), :]
                    n4 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n4), :]
                    n5 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n5), :]
                    n6 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n6), :]
                    n7 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n7), :]
                    n8 = xyz_cid0[self.model.grid.get_node_index_by_node_id(n8), :]

                    (area1, centroid1) = quad_area_centroid(n1, n2, n3, n4)
                    (area2, centroid2) = quad_area_centroid(n5, n6, n7, n8)
                    volumei = (area1 + area2) / 2. * np.linalg.norm(centroid1 - centroid2, axis=1)
                else:
                    msg = "Element.get_mass doesn't support %s; try %s.get_mass" % (etype, etype)
                    self.model.log.debug(msg)
                    ni += n
                    continue
                mass[ni:ni+n] = volumei * rho
            else:
                msg = "  Element.get_mass doesn't support %s; try %s.get_mass" % (etype, etype)
                self.model.log.debug(msg)
                msg = "Element.get_mass doesn't support %s; try %s.get_mass" % (etype, etype)
                raise NotImplementedError(msg)
            #self.model.log.debug("")
            ni += n
        #self.model.log.debug('data2 = %s' % data2)
        diff = np.setdiff1d(element_ids_orig, element_ids_to_analyze)
        #print('A = %s' % element_ids_orig)
        #print('B = %s' % element_ids_to_analyze)
        #print('A-B = %s' % (diff))
        #print('eids2 = %s' % eids2)
        #if len(diff):
            #print('cumdiff = %s' % list(diff))
            #print('eids out = %s' % eids2)
            #print('len(eids2) =', len(eids2))
            #print('len(diff) =', len(diff))
        eids2[ni:] = diff
        if sort_output:
            i = np.argsort(eids2)
            eids2 = eids2[i]
            mass = mass[i]
        return eids2, mass
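
The bookkeeping at the end of this method (fill the front of the output with the analyzed element ids, append the skipped ones with np.setdiff1d, then optionally np.argsort everything) can be seen in isolation in the small numpy-only sketch below. The array names are illustrative and not taken from the project above; NaN is used here as the placeholder mass for elements that were never analyzed, which is one assumption of the sketch.

import numpy as np

all_eids = np.array([1, 2, 3, 4, 5])      # every requested element id
analyzed = np.array([2, 4])               # ids that actually got a mass
mass_analyzed = np.array([10.0, 20.0])

eids_out = np.empty(len(all_eids), dtype=int)
mass_out = np.full(len(all_eids), np.nan)       # skipped ids keep a NaN mass
n = len(analyzed)
eids_out[:n] = analyzed
mass_out[:n] = mass_analyzed
eids_out[n:] = np.setdiff1d(all_eids, analyzed)  # append the skipped ids

order = np.argsort(eids_out)                    # optional sorted output
print(eids_out[order])
print(mass_out[order])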

Example 72

Project: control Source File: read_path.py
Function: get_sequence
def get_sequence(sequence, writebox, spaces=False):
    """Returns a sequence 

    sequence list: the sequence of integers
    writebox list: [min x, max x, min y, max y]
    """

    nans = np.array([np.nan, np.nan])
    nums = nans.copy()

    if spaces is False:
        each_num_width = (writebox[1] - writebox[0]) / float(len(sequence))
    else: 
        each_num_width = (writebox[1] - writebox[0]) / float(len(sequence)*2 - 1)

    for ii, nn in enumerate(sequence):

        if spaces is False:
            num_writebox = [writebox[0] + each_num_width * ii,
                            writebox[0] + each_num_width * (ii+1),
                            writebox[2], writebox[3]]
        else:
            num_writebox = [writebox[0] + each_num_width * 2 * ii,
                            writebox[0] + each_num_width * 2 * (ii+.5),
                            writebox[2], writebox[3]]
        if isinstance(nn, int):
            nn = str(nn)
        num = get_raw_data(nn, num_writebox)
        nums = np.vstack([nums, num, nans])

    return nums 
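
The np.nan rows stacked between characters act as "pen up" separators: any plotting routine that skips NaN (matplotlib's plot does) draws each character as a disconnected stroke. A minimal sketch of that idea, independent of the get_raw_data helper above (the stroke coordinates are made up):

import numpy as np

nans = np.array([np.nan, np.nan])
char_a = np.array([[0.0, 0.0], [0.0, 1.0], [0.5, 1.5]])   # made-up stroke
char_b = np.array([[1.0, 0.0], [1.5, 1.0]])               # made-up stroke
path = np.vstack([nans, char_a, nans, char_b, nans])

# plt.plot(path[:, 0], path[:, 1]) would draw two separate strokes,
# because matplotlib does not connect points across NaN values.
print(path)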

Example 73

Project: zipline Source File: continuous_future_reader.py
Function: load_raw_arrays
    def load_raw_arrays(self, columns, start_date, end_date, assets):
        """
        Parameters
        ----------
        columns : list of str
            The fields to load; 'volume' and 'sid' are returned as int64,
            all other fields as float64.
        start_date : Timestamp
            Beginning of the window range.
        end_date : Timestamp
            End of the window range.
        assets : list of continuous future assets
            The assets in the window.

        Returns
        -------
        list of np.ndarray
            A list with an entry per field of ndarrays with shape
            (sessions in range, assets), containing the values for the
            respective field over the start and end date range.
        """
        rolls_by_asset = {}
        for asset in assets:
            rf = self._roll_finders[asset.roll_style]
            rolls_by_asset[asset] = rf.get_rolls(
                asset.root_symbol, start_date, end_date, asset.offset)
        num_sessions = len(
            self.trading_calendar.sessions_in_range(start_date, end_date))
        shape = num_sessions, len(assets)

        results = []

        tc = self._bar_reader.trading_calendar
        sessions = tc.sessions_in_range(start_date, end_date)

        # Get partitions
        partitions_by_asset = {}
        for asset in assets:
            rolls_by_asset[asset] = rf.get_rolls(
                asset.root_symbol, start_date, end_date, asset.offset)
            partitions = []
            partitions_by_asset[asset] = partitions
            rolls = rolls_by_asset[asset]
            start = start_date
            for roll in rolls:
                sid, roll_date = roll
                start_loc = sessions.get_loc(start)
                if roll_date is not None:
                    end = roll_date - sessions.freq
                    end_loc = sessions.get_loc(end)
                else:
                    end = end_date
                    end_loc = len(sessions) - 1
                partitions.append((sid, start, end, start_loc, end_loc))
                if roll[-1] is not None:
                    start = sessions[end_loc + 1]

        for column in columns:
            if column != 'volume' and column != 'sid':
                out = np.full(shape, np.nan)
            else:
                out = np.zeros(shape, dtype=np.int64)
            for i, asset in enumerate(assets):
                partitions = partitions_by_asset[asset]
                for sid, start, end, start_loc, end_loc in partitions:
                    if column != 'sid':
                        result = self._bar_reader.load_raw_arrays(
                            [column], start, end, [sid])[0][:, 0]
                    else:
                        result = int(sid)
                    out[start_loc:end_loc + 1, i] = result
            results.append(out)
        return results
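
The per-column dtype choice above rests on the fact that np.nan only exists for floating-point dtypes: price-like fields are preallocated with np.full(shape, np.nan), while integer fields ('volume' and 'sid') fall back to zero-filled int64 arrays because an integer array cannot hold NaN. A small illustration of the distinction:

import numpy as np

shape = (3, 2)
prices = np.full(shape, np.nan)            # float64, NaN marks "no data"
volumes = np.zeros(shape, dtype=np.int64)  # int64 cannot hold NaN, so 0 is used

prices[0, 0] = 101.5
print(np.isnan(prices).sum())  # 5 cells still unfilled
print(volumes.dtype)           # int64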

Example 74

Project: verif Source File: Input.py
   def __init__(self, filename):
      import csv
      Input.__init__(self, filename)
      file = open(filename, 'r')
      self._units = "Unknown units"
      self._variable = "Unknown"
      self._pit = None

      self._dates = set()
      self._offsets = set()
      self._stations = set()
      self._quantiles = set()
      self._thresholds = set()
      fields = dict()
      obs = dict()
      fcst = dict()
      cdf = dict()
      pit = dict()
      x = dict()
      indices = dict()
      header = None

      # Default values if columns not available
      offset = 0
      date = 0
      lat = 0
      lon = 0
      elev = 0
      # Store station data, to ensure we don't have conflicting lat/lon/elev info for the same ids
      stationInfo = dict()
      shownConflictingWarning = False

      import time
      start = time.time()
      # Read the data into dictionary with (date,offset,lat,lon,elev) as key and obs/fcst as values
      for rowstr in file:
         if(rowstr[0] == "#"):
            curr = rowstr[1:]
            curr = curr.split()
            if(curr[0] == "variable:"):
               self._variable = ' '.join(curr[1:])
            elif(curr[0] == "units:"):
               self._units = curr[1]
            else:
               Util.warning("Ignoring line '" + rowstr.strip() + "' in file '" + filename + "'")
         else:
            row = rowstr.split()
            if(header is None):
               # Parse the header so we know what each column represents
               header = row
               for i in range(0, len(header)):
                  att = header[i]
                  if(att == "date"):
                     indices["date"] = i
                  elif(att == "offset"):
                     indices["offset"] = i
                  elif(att == "lat"):
                     indices["lat"] = i
                  elif(att == "lon"):
                     indices["lon"] = i
                  elif(att == "elev"):
                     indices["elev"] = i
                  elif(att == "obs"):
                     indices["obs"] = i
                  elif(att == "fcst"):
                     indices["fcst"] = i
                  else:
                     indices[att] = i

               # Ensure we have required columns
               requiredColumns = ["obs", "fcst"]
               for col in requiredColumns:
                  if(col not in indices):
                     msg = "Could not parse %s: Missing column '%s'" % (filename, col)
                     Util.error(msg)
            else:
               if(len(row) != len(header)):
                  Util.error("Incorrect number of columns (expecting %d) in row '%s'"
                        % (len(header), rowstr.strip()))
               if("date" in indices):
                  date = self._clean(row[indices["date"]])
               self._dates.add(date)
               if("offset" in indices):
                  offset = self._clean(row[indices["offset"]])
               self._offsets.add(offset)
               if("id" in indices):
                  id = self._clean(row[indices["id"]])
               else:
                  id = np.nan

               # Lookup previous stationInfo
               currLat = np.nan
               currLon = np.nan
               currElev = np.nan
               if("lat" in indices):
                  currLat = self._clean(row[indices["lat"]])
               if("lon" in indices):
                  currLon = self._clean(row[indices["lon"]])
               if("elev" in indices):
                  currElev = self._clean(row[indices["elev"]])
               if not np.isnan(id) and id in stationInfo:
                  lat = stationInfo[id].lat()
                  lon = stationInfo[id].lon()
                  elev = stationInfo[id].elev()
                  if not shownConflictingWarning:
                     if (not np.isnan(currLat) and abs(currLat - lat) > 0.0001) or (not np.isnan(currLon) and abs(currLon - lon) > 0.0001) or (not np.isnan(currElev) and abs(currElev - elev) > 0.001):
                        print currLat - lat, currLon - lon, currElev - elev
                        Util.warning("Conflicting lat/lon/elev information: (%f,%f,%f) does not match (%f,%f,%f)" % (currLat, currLon, currElev, lat, lon, elev))
                        shownConflictingWarning = True
               else:
                  if np.isnan(currLat):
                     currLat = 0
                  if np.isnan(currLon):
                     currLon = 0
                  if np.isnan(currElev):
                     currElev = 0
                  station = Station.Station(id, currLat, currLon, currElev)
                  self._stations.add(station)
                  stationInfo[id] = station

               lat = stationInfo[id].lat()
               lon = stationInfo[id].lon()
               elev = stationInfo[id].elev()
               key = (date, offset, lat, lon, elev)
               obs[key] = self._clean(row[indices["obs"]])
               fcst[key] = self._clean(row[indices["fcst"]])
               quantileFields = self._getQuantileFields(header)
               thresholdFields = self._getThresholdFields(header)
               if "pit" in indices:
                  pit[key] = self._clean(row[indices["pit"]])
               for field in quantileFields:
                  quantile = float(field[1:])
                  self._quantiles.add(quantile)
                  key = (date, offset, lat, lon, elev, quantile)
                  x[key] = self._clean(row[indices[field]])
               for field in thresholdFields:
                  threshold = float(field[1:])
                  self._thresholds.add(threshold)
                  key = (date, offset, lat, lon, elev, threshold)
                  cdf[key] = self._clean(row[indices[field]])

      end = time.time()
      file.close()
      self._dates = list(self._dates)
      self._offsets = list(self._offsets)
      self._stations = list(self._stations)
      self._quantiles = list(self._quantiles)
      self._thresholds = np.array(list(self._thresholds))
      Ndates = len(self._dates)
      Noffsets = len(self._offsets)
      Nlocations = len(self._stations)
      Nquantiles = len(self._quantiles)
      Nthresholds = len(self._thresholds)

      # Put the dictionary data into a regular 3D array
      self._obs = np.zeros([Ndates, Noffsets, Nlocations], 'float') * np.nan
      self._fcst = np.zeros([Ndates, Noffsets, Nlocations], 'float') * np.nan
      if(len(pit) != 0):
         self._pit = np.zeros([Ndates, Noffsets, Nlocations], 'float') * np.nan
      self._cdf = np.zeros([Ndates, Noffsets, Nlocations, Nthresholds], 'float') * np.nan
      self._x = np.zeros([Ndates, Noffsets, Nlocations, Nquantiles], 'float') * np.nan
      for d in range(0, len(self._dates)):
         date = self._dates[d]
         end = time.time()
         for o in range(0, len(self._offsets)):
            offset = self._offsets[o]
            for s in range(0, len(self._stations)):
               station = self._stations[s]
               lat = station.lat()
               lon = station.lon()
               elev = station.elev()
               key = (date, offset, lat, lon, elev)
               if(key in obs):
                  self._obs[d][o][s] = obs[key]
               if(key in fcst):
                  self._fcst[d][o][s] = fcst[key]
               if(key in pit):
                  self._pit[d][o][s] = pit[key]
               for q in range(0, len(self._quantiles)):
                  quantile = self._quantiles[q]
                  key = (date, offset, lat, lon, elev, quantile)
                  if(key in x):
                     self._x[d, o, s, q] = x[key]
               for t in range(0, len(self._thresholds)):
                  threshold = self._thresholds[t]
                  key = (date, offset, lat, lon, elev, threshold)
                  if(key in cdf):
                     self._cdf[d, o, s, t] = cdf[key]
      end = time.time()
      maxStationId = np.nan
      for station in self._stations:
         if(np.isnan(maxStationId)):
            maxStationId = station.id()
         elif(station.id() > maxStationId):
            maxStationId = station.id()

      counter = 0
      if(not np.isnan(maxStationId)):
         counter = maxStationId + 1

      for station in self._stations:
         if(np.isnan(station.id())):
            station.id(counter)
            counter = counter + 1
      self._dates = np.array(self._dates)
      self._offsets = np.array(self._offsets)
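
This reader builds its NaN-filled containers with the older idiom np.zeros(shape, 'float') * np.nan: multiplying zeros by NaN propagates NaN everywhere, which np.full expresses more directly. A quick equivalence check:

import numpy as np

a = np.zeros([2, 3], 'float') * np.nan
b = np.full([2, 3], np.nan)

print(np.isnan(a).all(), np.isnan(b).all())  # True True
print(a.dtype, b.dtype)                      # float64 float64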

Example 75

Project: scikit-learn Source File: test_ranking.py
Function: test_roc_curve_toydata
def test_roc_curve_toydata():
    # Binary classification
    y_true = [0, 1]
    y_score = [0, 1]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1])
    assert_array_almost_equal(fpr, [1, 1])
    assert_almost_equal(roc_auc, 1.)

    y_true = [0, 1]
    y_score = [1, 0]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1, 1])
    assert_array_almost_equal(fpr, [0, 0, 1])
    assert_almost_equal(roc_auc, 0.)

    y_true = [1, 0]
    y_score = [1, 1]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1])
    assert_array_almost_equal(fpr, [0, 1])
    assert_almost_equal(roc_auc, 0.5)

    y_true = [1, 0]
    y_score = [1, 0]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1])
    assert_array_almost_equal(fpr, [1, 1])
    assert_almost_equal(roc_auc, 1.)

    y_true = [1, 0]
    y_score = [0.5, 0.5]
    tpr, fpr, _ = roc_curve(y_true, y_score)
    roc_auc = roc_auc_score(y_true, y_score)
    assert_array_almost_equal(tpr, [0, 1])
    assert_array_almost_equal(fpr, [0, 1])
    assert_almost_equal(roc_auc, .5)

    y_true = [0, 0]
    y_score = [0.25, 0.75]
    # assert UndefinedMetricWarning because of no positive sample in y_true
    tpr, fpr, _ = assert_warns(UndefinedMetricWarning, roc_curve, y_true, y_score)
    assert_raises(ValueError, roc_auc_score, y_true, y_score)
    assert_array_almost_equal(tpr, [0., 0.5, 1.])
    assert_array_almost_equal(fpr, [np.nan, np.nan, np.nan])

    y_true = [1, 1]
    y_score = [0.25, 0.75]
    # assert UndefinedMetricWarning because of no negative sample in y_true
    tpr, fpr, _ = assert_warns(UndefinedMetricWarning, roc_curve, y_true, y_score)
    assert_raises(ValueError, roc_auc_score, y_true, y_score)
    assert_array_almost_equal(tpr, [np.nan, np.nan])
    assert_array_almost_equal(fpr, [0.5, 1.])

    # Multi-label classification task
    y_true = np.array([[0, 1], [0, 1]])
    y_score = np.array([[0, 1], [0, 1]])
    assert_raises(ValueError, roc_auc_score, y_true, y_score, average="macro")
    assert_raises(ValueError, roc_auc_score, y_true, y_score,
                  average="weighted")
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 1.)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 1.)

    y_true = np.array([[0, 1], [0, 1]])
    y_score = np.array([[0, 1], [1, 0]])
    assert_raises(ValueError, roc_auc_score, y_true, y_score, average="macro")
    assert_raises(ValueError, roc_auc_score, y_true, y_score,
                  average="weighted")
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0.5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0.5)

    y_true = np.array([[1, 0], [0, 1]])
    y_score = np.array([[0, 1], [1, 0]])
    assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), 0)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), 0)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), 0)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), 0)

    y_true = np.array([[1, 0], [0, 1]])
    y_score = np.array([[0.5, 0.5], [0.5, 0.5]])
    assert_almost_equal(roc_auc_score(y_true, y_score, average="macro"), .5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="weighted"), .5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="samples"), .5)
    assert_almost_equal(roc_auc_score(y_true, y_score, average="micro"), .5)
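
The assertions against expected arrays containing np.nan work because numpy's testing helpers treat NaNs in matching positions as equal, unlike the == operator, for which NaN never compares equal to anything:

import numpy as np
from numpy.testing import assert_array_almost_equal

a = np.array([np.nan, 0.5, 1.0])
b = np.array([np.nan, 0.5, 1.0])

print(np.array_equal(a, b))        # False: NaN != NaN elementwise
assert_array_almost_equal(a, b)    # passes: NaNs at the same positions match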

Example 76

Project: blaze Source File: test_pandas_compute.py
def test_outer_join():
    left = [(1, 'Alice', 100),
            (2, 'Bob', 200),
            (4, 'Dennis', 400)]
    left = DataFrame(left, columns=['id', 'name', 'amount'])

    right = [('NYC', 1),
             ('Boston', 1),
             ('LA', 3),
             ('Moscow', 4)]
    right = DataFrame(right, columns=['city', 'id'])

    lsym = symbol('lsym', 'var * {id: int, name: string, amount: real}')
    rsym = symbol('rsym', 'var * {city: string, id: int}')

    convert = lambda df: set(df.to_records(index=False).tolist())

    assert (convert(compute(join(lsym, rsym), {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (4, 'Dennis', 400, 'Moscow')]))

    assert (convert(compute(join(lsym, rsym, how='left'),
                            {lsym: left, rsym: right})) ==
            set([(1, 'Alice', 100, 'NYC'),
                 (1, 'Alice', 100, 'Boston'),
                 (2, 'Bob', 200, np.nan),
                 (4, 'Dennis', 400, 'Moscow')]))

    df = compute(join(lsym, rsym, how='right'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = pdsort(df, 'id').to_records(index=False)
    expected = pdsort(expected, 'id').to_records(index=False)
    np.array_equal(result, expected)

    df = compute(join(lsym, rsym, how='outer'), {lsym: left, rsym: right})
    expected = DataFrame([(1., 'Alice', 100., 'NYC'),
                          (1., 'Alice', 100., 'Boston'),
                          (2., 'Bob', 200., np.nan),
                          (3., np.nan, np.nan, 'LA'),
                          (4., 'Dennis', 400., 'Moscow')],
                         columns=['id', 'name', 'amount', 'city'])

    result = pdsort(df, 'id').to_records(index=False)
    expected = pdsort(expected, 'id').to_records(index=False)
    np.array_equal(result, expected)
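
The np.nan entries in the expected frames come from pandas itself: rows that have no match in an outer (or one-sided) join get NaN in the columns contributed by the other table. A stripped-down version of the same join:

import pandas as pd

left = pd.DataFrame({'id': [1, 2, 4], 'name': ['Alice', 'Bob', 'Dennis']})
right = pd.DataFrame({'id': [1, 3, 4], 'city': ['NYC', 'LA', 'Moscow']})

out = pd.merge(left, right, on='id', how='outer')
print(out)  # id 2 gets a NaN city, id 3 gets a NaN name
print(out['city'].isna().sum(), out['name'].isna().sum())  # 1 1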

Example 77

Project: scikit-bio Source File: distance.py
@experimental(as_of='0.5.0')
def kmer_distance(seq1, seq2, k, overlap=True):
    """Compute the kmer distance between a pair of sequences

    The kmer distance between two sequences is the fraction of kmers that are
    unique to either sequence.

    Parameters
    ----------
    seq1, seq2 : Sequence
        Sequences to compute kmer distance between.
    k : int
        The kmer length.
    overlap : bool, optional
        Defines whether the kmers should be overlapping or not.

    Returns
    -------
    float
        kmer distance between `seq1` and `seq2`.

    Raises
    ------
    ValueError
        If `k` is less than 1.
    TypeError
        If `seq1` and `seq2` are not ``Sequence`` instances.
    TypeError
        If `seq1` and `seq2` are not the same type.

    Notes
    -----
    kmer counts are not incorporated in this distance metric.

    ``np.nan`` will be returned if there are no kmers defined for the
    sequences.

    Examples
    --------
    >>> from skbio import Sequence
    >>> seq1 = Sequence('ATCGGCGAT')
    >>> seq2 = Sequence('GCAGATGTG')
    >>> kmer_distance(seq1, seq2, 3) # doctest: +ELLIPSIS
    0.9230769230...

    """
    _check_seqs(seq1, seq2)
    seq1_kmers = set(map(str, seq1.iter_kmers(k, overlap=overlap)))
    seq2_kmers = set(map(str, seq2.iter_kmers(k, overlap=overlap)))
    all_kmers = seq1_kmers | seq2_kmers
    if not all_kmers:
        return np.nan
    shared_kmers = seq1_kmers & seq2_kmers
    number_unique = len(all_kmers) - len(shared_kmers)
    fraction_unique = number_unique / len(all_kmers)
    return fraction_unique
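
Returning np.nan for the degenerate case keeps "no kmers to compare" distinguishable from a genuine distance of 0 or 1. The same guard, written as a plain-string stand-in rather than with the skbio Sequence type:

import numpy as np

def kmer_set(s, k):
    """All overlapping substrings of length k (plain-string stand-in)."""
    return {s[i:i + k] for i in range(len(s) - k + 1)}

def kmer_distance(s1, s2, k):
    all_kmers = kmer_set(s1, k) | kmer_set(s2, k)
    if not all_kmers:
        return np.nan          # undefined: neither sequence has any kmers
    shared = kmer_set(s1, k) & kmer_set(s2, k)
    return (len(all_kmers) - len(shared)) / len(all_kmers)

print(kmer_distance('ATCGGCGAT', 'GCAGATGTG', 3))  # ~0.923
print(kmer_distance('AT', 'GC', 3))                # nan (sequences shorter than k)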

Example 78

Project: iris Source File: __init__.py
def _regularise(grib_message):
    """
    Transform a reduced grid to a regular grid using interpolation.

    Uses 1d linear interpolation at constant latitude to make the grid
    regular. If the longitude dimension is circular then this is taken
    into account by the interpolation. If the longitude dimension is not
    circular then extrapolation is allowed to make sure all end regular
    grid points get a value. In practice this extrapolation is likely to
    be minimal.

    """
    # Make sure to read any missing values as NaN.
    gribapi.grib_set_double(grib_message, "missingValue", np.nan)

    # Get full longitude values, these describe the longitude value of
    # *every* point in the grid, they are not 1d monotonic coordinates.
    lons = gribapi.grib_get_double_array(grib_message, "longitudes")

    # Compute the new longitude coordinate for the regular grid.
    new_nx = max(gribapi.grib_get_long_array(grib_message, "pl"))
    new_x_step = (max(lons) - min(lons)) / (new_nx - 1)
    if gribapi.grib_get_long(grib_message, "iScansNegatively"):
        new_x_step *= -1

    new_lons = np.arange(new_nx) * new_x_step + lons[0]
    # Get full latitude and data values, these describe the latitude and
    # data values of *every* point in the grid, they are not 1d monotonic
    # coordinates.
    lats = gribapi.grib_get_double_array(grib_message, "latitudes")
    values = gribapi.grib_get_double_array(grib_message, "values")

    # Retrieve the distinct latitudes from the GRIB message. GRIBAPI docs
    # don't specify if these points are guaranteed to be oriented correctly so
    # the safe option is to sort them into ascending (south-to-north) order
    # and then reverse the order if necessary.
    new_lats = gribapi.grib_get_double_array(grib_message, "distinctLatitudes")
    new_lats.sort()
    if not gribapi.grib_get_long(grib_message, "jScansPositively"):
        new_lats = new_lats[::-1]
    ny = new_lats.shape[0]

    # Use 1d linear interpolation along latitude circles to regularise the
    # reduced data.
    cyclic = _longitude_is_cyclic(new_lons)
    new_values = np.empty([ny, new_nx], dtype=values.dtype)
    for ilat, lat in enumerate(new_lats):
        idx = np.where(lats == lat)
        llons = lons[idx]
        vvalues = values[idx]
        if cyclic:
            # For cyclic data we insert dummy points at each end to ensure
            # we can interpolate to all output longitudes using pure
            # interpolation.
            cgap = (360 - llons[-1] - llons[0])
            llons = np.concatenate(
                (llons[0:1] - cgap, llons, llons[-1:] + cgap))
            vvalues = np.concatenate(
                (vvalues[-1:], vvalues, vvalues[0:1]))
            fixed_latitude_interpolator = scipy.interpolate.interp1d(
                llons, vvalues)
        else:
            # Allow extrapolation for non-cyclic data sets to ensure we can
            # interpolate to all output longitudes.
            fixed_latitude_interpolator = Linear1dExtrapolator(
                scipy.interpolate.interp1d(llons, vvalues))
        new_values[ilat] = fixed_latitude_interpolator(new_lons)
    new_values = new_values.flatten()

    # Set flags for the regularised data.
    if np.isnan(new_values).any():
        # Account for any missing data.
        gribapi.grib_set_double(grib_message, "missingValue", np.inf)
        gribapi.grib_set(grib_message, "bitmapPresent", 1)
        new_values = np.where(np.isnan(new_values), np.inf, new_values)

    gribapi.grib_set_long(grib_message, "Nx", int(new_nx))
    gribapi.grib_set_double(grib_message,
                            "iDirectionIncrementInDegrees", float(new_x_step))
    gribapi.grib_set_double_array(grib_message, "values", new_values)
    gribapi.grib_set_long(grib_message, "jPointsAreConsecutive", 0)
    gribapi.grib_set_long(grib_message, "PLPresent", 0)

Example 79

Project: tia Source File: pl.py
Function: compute
    def compute(self, txns):
        """Compute the long/short live-to-date transaction level profit and loss. Uses an open average calculation"""
        txndata = txns.frame
        mktdata = txns.pricer.get_eod_frame()
        if not isinstance(mktdata.index, pd.DatetimeIndex):
            mktdata.to_timestamp(freq='B')

        # get the set of all txn dts and mkt data dts
        pl = pd.merge(txndata, mktdata.reset_index(), how='outer', on=TPL.DT)
        if pl[TC.PID].isnull().all():
            ltd_frame = pd.DataFrame(index=pl.index)
            ltd_frame[TPL.DT] = pl[PL.DT]
            ltd_frame[TPL.POS] = 0
            ltd_frame[TPL.PID] = 0
            ltd_frame[TPL.TID] = 0
            ltd_frame[TPL.TXN_QTY] = np.nan
            ltd_frame[TPL.TXN_PX] = np.nan
            ltd_frame[TPL.TXN_FEES] = 0
            ltd_frame[TPL.TXN_PREMIUM] = 0
            ltd_frame[TPL.TXN_INTENT] = 0
            ltd_frame[TPL.TXN_ACTION] = 0
            ltd_frame[TPL.CLOSE_PX] = pl[TPL.CLOSE_PX]
            ltd_frame[TPL.OPEN_VAL] = 0
            ltd_frame[TPL.MKT_VAL] = 0
            ltd_frame[TPL.TOT_VAL] = 0
            ltd_frame[TPL.DVDS] = 0
            ltd_frame[TPL.FEES] = 0
            ltd_frame[TPL.RPL_GROSS] = 0
            ltd_frame[TPL.RPL] = 0
            ltd_frame[TPL.UPL] = 0
            ltd_frame[TPL.PL] = 0
            return ltd_frame
        else:
            pl.sort([TC.DT, TC.PID, TC.TID], inplace=1)
            pl.reset_index(inplace=1, drop=1)
            # check that all days can be priced
            has_position = pl[TC.PID] > 0
            missing_pxs = pl[MC.CLOSE].isnull()
            missing = pl[TC.DT][has_position & missing_pxs]
            if len(missing) > 0:
                msg = 'insufficient price data: {0} prices missing for dates {1}'
                mdates = ','.join([_.strftime('%Y-%m-%d') for _ in set(missing[:5])])
                mdates += (len(missing) > 5 and '...' or '')
                raise Exception(msg.format(len(missing), mdates))

            # Now there is a row for every timestamp. Now compute the pl and fill in where missing data should be
            cols = [TC.DT, TC.POS, TC.PID, TC.TID, TC.INTENT, TC.ACTION, TC.FEES, TC.QTY, TC.PX, TC.PREMIUM,
                    TC.OPEN_VAL]
            dts, pos_qtys, pids, tids, intents, sides, txn_fees, txn_qtys, txn_pxs, premiums, open_vals = [pl[c] for c
                                                                                                           in
                                                                                                           cols]

            dvds, closing_pxs, mkt_vals = [pl[c] for c in [MC.DVDS, MC.CLOSE, MC.MKT_VAL]]
            # Ensure only end of day is kept for dividends (join will match dvd to any transaction during day)
            dvds = dvds.where(dts != dts.shift(-1), 0)
            # fill in pl dates
            open_vals.ffill(inplace=1)
            open_vals.fillna(0, inplace=1)
            pos_qtys.ffill(inplace=1)
            pos_qtys.fillna(0, inplace=1)
            # pid is the only tricky one, copy only while position is open
            inpos = intents.notnull() | (pos_qtys != 0)
            pids = np.where(inpos, pids.ffill(), 0)
            pl['pid'] = pids.astype(int)
            # Zero fill missing
            dvds.fillna(0, inplace=1)
            tids.fillna(0, inplace=1)
            tids = tids.astype(int)
            intents.fillna(0, inplace=1)
            intents = intents.astype(int)
            sides.fillna(0, inplace=1)
            sides = sides.astype(int)
            txn_fees.fillna(0, inplace=1)
            premiums.fillna(0, inplace=1)
            # LTD p/l calculation
            fees = txn_fees.cumsum()
            total_vals = premiums.cumsum()
            mkt_vals *= pos_qtys
            dvds = (dvds * pos_qtys).cumsum()
            rpl_gross = total_vals - open_vals
            rpl = rpl_gross + fees + dvds
            upl = mkt_vals + open_vals
            tpl = upl + rpl
            # build the result
            data = OrderedDict()
            data[TPL.DT] = dts
            data[TPL.POS] = pos_qtys
            data[TPL.PID] = pids
            data[TPL.TID] = tids
            data[TPL.TXN_QTY] = txn_qtys
            data[TPL.TXN_PX] = txn_pxs
            data[TPL.TXN_FEES] = txn_fees
            data[TPL.TXN_PREMIUM] = premiums
            data[TPL.TXN_INTENT] = intents
            data[TPL.TXN_ACTION] = sides
            data[TPL.CLOSE_PX] = closing_pxs
            data[TPL.OPEN_VAL] = open_vals
            data[TPL.MKT_VAL] = mkt_vals
            data[TPL.TOT_VAL] = total_vals
            data[TPL.DVDS] = dvds
            data[TPL.FEES] = fees
            data[TPL.RPL_GROSS] = rpl_gross
            data[TPL.RPL] = rpl
            data[TPL.UPL] = upl
            data[TPL.PL] = tpl
            ltd_frame = pd.DataFrame(data, columns=data.keys())
            return ltd_frame
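
In the merged frame above, NaN marks "no transaction happened on this row", and each column then resolves it differently: transaction quantities and prices stay NaN, fees and premiums are zero-filled before the cumulative sums, and open values and position quantities are forward-filled. A compact pandas illustration of those three treatments (column names here are illustrative):

import numpy as np
import pandas as pd

pl = pd.DataFrame({
    'txn_px':   [100.0, np.nan, np.nan, 101.0],   # keep NaN: no trade, no price
    'txn_fees': [1.0,   np.nan, np.nan, 2.0],     # zero-fill before cumsum
    'open_val': [1000.0, np.nan, np.nan, 2000.0]  # forward-fill the open position
})

pl['txn_fees'] = pl['txn_fees'].fillna(0)
pl['open_val'] = pl['open_val'].ffill()
pl['fees_ltd'] = pl['txn_fees'].cumsum()
print(pl)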

Example 80

Project: zipline Source File: cumulative.py
    def __init__(self, sim_params, treasury_curves, trading_calendar,
                 create_first_day_stats=False):
        self.treasury_curves = treasury_curves
        self.trading_calendar = trading_calendar
        self.start_session = sim_params.start_session
        self.end_session = sim_params.end_session

        self.sessions = trading_calendar.sessions_in_range(
            self.start_session, self.end_session
        )

        # Hold on to the trading day before the start,
        # used for index of the zero return value when forcing returns
        # on the first day.
        self.day_before_start = self.start_session - self.sessions.freq

        last_day = normalize_date(sim_params.end_session)
        if last_day not in self.sessions:
            last_day = pd.tseries.index.DatetimeIndex(
                [last_day]
            )
            self.sessions = self.sessions.append(last_day)

        self.sim_params = sim_params

        self.create_first_day_stats = create_first_day_stats

        cont_index = self.sessions

        self.cont_index = cont_index
        self.cont_len = len(self.cont_index)

        empty_cont = np.full(self.cont_len, np.nan)

        self.algorithm_returns_cont = empty_cont.copy()
        self.benchmark_returns_cont = empty_cont.copy()
        self.algorithm_cumulative_leverages_cont = empty_cont.copy()
        self.mean_returns_cont = empty_cont.copy()
        self.annualized_mean_returns_cont = empty_cont.copy()
        self.mean_benchmark_returns_cont = empty_cont.copy()
        self.annualized_mean_benchmark_returns_cont = empty_cont.copy()

        # The returns at a given time are read and reset from the respective
        # returns container.
        self.algorithm_returns = None
        self.benchmark_returns = None
        self.mean_returns = None
        self.annualized_mean_returns = None
        self.mean_benchmark_returns = None
        self.annualized_mean_benchmark_returns = None

        self.algorithm_cumulative_returns = empty_cont.copy()
        self.benchmark_cumulative_returns = empty_cont.copy()
        self.algorithm_cumulative_leverages = empty_cont.copy()
        self.excess_returns = empty_cont.copy()

        self.latest_dt_loc = 0
        self.latest_dt = cont_index[0]

        self.benchmark_volatility = empty_cont.copy()
        self.algorithm_volatility = empty_cont.copy()
        self.beta = empty_cont.copy()
        self.alpha = empty_cont.copy()
        self.sharpe = empty_cont.copy()
        self.downside_risk = empty_cont.copy()
        self.sortino = empty_cont.copy()
        self.information = empty_cont.copy()

        self.drawdowns = empty_cont.copy()
        self.max_drawdowns = empty_cont.copy()
        self.max_drawdown = 0
        self.max_leverages = empty_cont.copy()
        self.max_leverage = 0
        self.current_max = -np.inf
        self.daily_treasury = pd.Series(index=self.sessions)
        self.treasury_period_return = np.nan

        self.num_trading_days = 0
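
Preallocating every statistic as a NaN-filled array keeps "not yet computed" distinguishable from "computed as 0.0", and the repeated .copy() matters because assigning empty_cont directly would make all the statistics share a single buffer. A minimal version of the pattern:

import numpy as np

n_sessions = 5
empty_cont = np.full(n_sessions, np.nan)

sharpe = empty_cont.copy()
beta = empty_cont.copy()          # independent buffer, not a view

sharpe[0] = 1.2
print(beta[0])                    # still nan: beta was not touched
print(np.isnan(sharpe).sum())     # 4 sessions not yet computed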

Example 81

Project: tia Source File: ta.py
def cross_signal(s1, s2, continuous=0):
    """ return a signal with the following
    1 : when all values of s1 cross above all values of s2
    -1 : when all values of s1 cross below all values of s2
    0 : if s1 < max(s2) and s1 > min(s2)
    np.nan : if s1 or s2 contains np.nan at position


    s1: Series, DataFrame, float, int, or tuple(float|int)
    s2: Series, DataFrame, float, int, or tuple(float|int)
    continuous: bool, if true then once the signal starts it is always 1 or -1
    """

    def _convert(src, other):
        if isinstance(src, pd.DataFrame):
            return src.min(axis=1, skipna=0), src.max(axis=1, skipna=0)
        elif isinstance(src, pd.Series):
            return src, src
        elif isinstance(src, (int, float)):
            s = pd.Series(src, index=other.index)
            return s, s
        elif isinstance(src, (tuple, list)):
            l, u = min(src), max(src)
            assert l <= u, 'lower bound must be less than upper bound'
            lower, upper = pd.Series(l, index=other.index), pd.Series(u, index=other.index)
            return lower, upper
        else:
            raise Exception('unable to handle type %s' % type(src))

    lower1, upper1 = _convert(s1, s2)
    lower2, upper2 = _convert(s2, s1)

    df = pd.DataFrame({'upper1': upper1, 'lower1': lower1, 'upper2': upper2, 'lower2': lower2})
    df.ffill(inplace=True)

    signal = pd.Series(np.nan, index=df.index)
    signal[df.upper1 > df.upper2] = 1
    signal[df.lower1 < df.lower2] = -1

    if continuous:
        # Just roll with 1, -1
        signal = signal.fillna(method='ffill')
        m1, m2 = df.upper1.first_valid_index(), df.upper2.first_valid_index()
        if m1 is not None or m2 is not None:
            m1 = m2 if m1 is None else m1
            m2 = m1 if m2 is None else m2
            fv = max(m1, m2)
            if np.isnan(signal[fv]):
                signal[fv] = 0
                signal.ffill(inplace=1)
    else:
        signal[(df.upper1 < df.upper2) & (df.lower1 > df.lower2)] = 0
        # special handling when equal, determine where it previously was
        eq = (df.upper1 == df.upper2)
        if eq.any():  # Set to prior value
            tmp = signal[eq]
            for i in tmp.index:
                loc = signal.index.get_loc(i)
                if loc != 0:
                    u, l = df.upper2.iloc[loc], df.lower2.iloc[loc]
                    ps = signal.iloc[loc - 1]
                    if u == l or ps == 1.:  # Line coming from above upper bound if ps == 1
                        signal[i] = ps
                    else:
                        signal[i] = 0

        eq = (df.lower1 == df.lower2)
        if eq.any():  # Set to prior value
            tmp = signal[eq]
            for i in tmp.index:
                loc = signal.index.get_loc(i)
                if loc != 0:
                    u, l = df.upper2.iloc[loc], df.lower2.iloc[loc]
                    ps = signal.iloc[loc - 1]
                    if u == l or ps == -1.:  # Line coming from below lower bound if ps == -1
                        signal[i] = ps
                    else:
                        signal[i] = 0

    return signal
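
The signal starts life as an all-NaN Series; boolean indexing then overwrites the crossing rows with 1/-1 (and 0 in the non-continuous case), so any position left as NaN simply means "no signal at this bar". A stripped-down version of that seeding pattern, not the full cross_signal logic:

import numpy as np
import pandas as pd

upper1 = pd.Series([1.0, 2.0, 3.0, 1.0])
upper2 = pd.Series([2.0, 2.0, 2.0, 2.0])
lower1, lower2 = upper1, upper2          # degenerate single-line "bands"

signal = pd.Series(np.nan, index=upper1.index)
signal[upper1 > upper2] = 1
signal[lower1 < lower2] = -1

print(signal.tolist())                   # [-1.0, nan, 1.0, -1.0]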

Example 82

Project: simpeg Source File: ediFilesUtils.py
    def importFiles(self):
        """
        Function to import EDI files into a object.


        """

        # Constants that are needed for conversion of units

        # Temp lists
        tmpStaList = []

        tmpCompList = ['freq','x','y','z']
        tmpCompList.extend(self.comps)
        # Make the outarray
        dtRI = [(compS.lower().replace('.',''),float) for compS in tmpCompList]
        # Loop through all the files
        for nrEDI, EDIfile in enumerate(self.filesList):
            # Read the file into a list of the lines
            with open(EDIfile,'r') as fid:
                EDIlines = fid.readlines()
            # Find the location
            latD, longD, elevM = _findLatLong(EDIlines)
            # Transform coordinates
            transCoord = self._transfromPoints(longD,latD)
            # Extract the name of the file (station)
            EDIname = EDIfile.split(os.sep)[-1].split('.')[0]
            # Arrange the data
            staList = [EDIname, EDIfile, transCoord[0], transCoord[1], elevM[0]]
            # Add to the station list
            tmpStaList.extend(staList)

            # Read the frequency data
            freq = _findEDIcomp('>FREQ',EDIlines)
            # Make the temporary rec array.
            tArrRec = ( np.nan*np.ones( (len(freq),len(dtRI)) ) ).view(dtRI)     #np.concatenate((freq*np.ones((locs.shape[0],1)),locs,np.nan*np.ones((locs.shape[0],8))),axis=1).view(dtRI)
            # Add data to the array
            tArrRec['freq'] = mkvc(freq,2)
            tArrRec['x'] = mkvc(np.ones((len(freq),1))*transCoord[0],2)
            tArrRec['y'] = mkvc(np.ones((len(freq),1))*transCoord[1],2)
            tArrRec['z'] = mkvc(np.ones((len(freq),1))*elevM[0],2)
            for comp in self.comps:
                # Deal with converting units of the impedance tensor
                if 'Z' in comp:
                    unitConvert = self._impUnitEDI2SI
                else:
                    unitConvert = 1
                # Rotate the data since EDI x is *north, y *east but Simpeg uses x *east, y *north (* means internal reference frame)
                key = [comp.lower().replace('.','').replace(s,t) for s,t in [['xx','yy'],['xy','yx'],['yx','xy'],['yy','xx']] if s in comp.lower()][0]
                tArrRec[key] = mkvc(unitConvert*_findEDIcomp('>'+comp,EDIlines),2)
            # Make a masked array
            mArrRec = np.ma.MaskedArray(rec_to_ndarr(tArrRec),mask=np.isnan(rec_to_ndarr(tArrRec))).view(dtype=tArrRec.dtype)
            try:
                outTemp = recFunc.stack_arrays((outTemp,mArrRec))
            except NameError:
                outTemp = mArrRec

        # Assign the data
        self._data = outTemp
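
Here np.nan*np.ones(...) builds a NaN template record and np.isnan then supplies the mask of the MaskedArray, so components that were never found in the EDI file stay masked rather than silently becoming zero. The core of that pattern on a plain array:

import numpy as np

data = np.nan * np.ones((3, 4))       # everything starts as "not read yet"
data[:, 0] = [1.0, 2.0, 3.0]          # pretend only the first column was parsed

masked = np.ma.MaskedArray(data, mask=np.isnan(data))
print(masked.mask.sum(), "entries still missing")   # 9
print(masked.mean(axis=0))            # column means ignore masked entries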

Example 83

Project: AlephNull Source File: test_algorithms.py
Function: handle_data
    def handle_data(self, data):
        self.history_return_price_class.append(
            self.return_price_class.handle_data(data))
        self.history_return_price_decorator.append(
            self.return_price_decorator.handle_data(data))
        self.history_return_args.append(
            self.return_args_batch.handle_data(
                data, *self.args, **self.kwargs))
        self.history_return_not_full.append(
            self.return_not_full.handle_data(data))
        self.uses_ufunc.handle_data(data)

        # check that calling transforms with the same arguments
        # is idempotent
        self.price_multiple.handle_data(data, 1, extra_arg=1)

        if self.price_multiple.full:
            pre = self.price_multiple.rolling_panel.get_current().shape[0]
            result1 = self.price_multiple.handle_data(data, 1, extra_arg=1)
            post = self.price_multiple.rolling_panel.get_current().shape[0]
            assert pre == post, "batch transform is appending redundant events"
            result2 = self.price_multiple.handle_data(data, 1, extra_arg=1)
            assert result1 is result2, "batch transform is not idempotent"

            # check that calling transform with the same data, but
            # different supplemental arguments results in new
            # results.
            result3 = self.price_multiple.handle_data(data, 2, extra_arg=1)
            assert result1 is not result3, \
                "batch transform is not updating for new args"

            result4 = self.price_multiple.handle_data(data, 1, extra_arg=2)
            assert result1 is not result4,\
                "batch transform is not updating for new kwargs"

        new_data = deepcopy(data)
        for sid in new_data:
            new_data[sid]['arbitrary'] = 123

        self.history_return_arbitrary_fields.append(
            self.return_arbitrary_fields.handle_data(new_data))

        # nan every second event price
        if self.iter % 2 == 0:
            self.history_return_nan.append(
                self.return_nan.handle_data(data))
        else:
            nan_data = deepcopy(data)
            for sid in nan_data.iterkeys():
                nan_data[sid].price = np.nan
            self.history_return_nan.append(
                self.return_nan.handle_data(nan_data))

        self.iter += 1

        # Add a new sid to check that it does not get included
        extra_sid_data = deepcopy(data)
        extra_sid_data[1] = extra_sid_data[0]
        self.history_return_sid_filter.append(
            self.return_sid_filter.handle_data(extra_sid_data)
        )

        # Add a field to check that it does not get included
        extra_field_data = deepcopy(data)
        extra_field_data[0]['ignore'] = extra_sid_data[0]['price']
        self.history_return_field_filter.append(
            self.return_field_filter.handle_data(extra_field_data)
        )
        self.history_return_field_no_filter.append(
            self.return_field_no_filter.handle_data(extra_field_data)
        )

Example 84

Project: empyrical Source File: stats.py
Function: sortino_ratio
def sortino_ratio(returns, required_return=0, period=DAILY,
                  annualization=None, _downside_risk=None):
    """
    Determines the Sortino ratio of a strategy.

    Parameters
    ----------
    returns : pd.Series or np.ndarray or pd.DataFrame
        Daily returns of the strategy, noncumulative.
        - See full explanation in :func:`~empyrical.stats.cum_returns`.
    required_return: float / series
        minimum acceptable return
    period : str, optional
        Defines the periodicity of the 'returns' data for purposes of
        annualizing. Value ignored if `annualization` parameter is specified.
        Defaults are:
            'monthly':12
            'weekly': 52
            'daily': 252
    annualization : int, optional
        Used to suppress default values available in `period` to convert
        returns into annual returns. Value should be the annual frequency of
        `returns`.
    _downside_risk : float, optional
        The downside risk of the given inputs, if known. Will be calculated if
        not provided.

    Returns
    -------
    float, pd.Series

        depends on input type
        series ==> float
        DataFrame ==> pd.Series

        Annualized Sortino ratio.

    """

    if len(returns) < 2:
        return np.nan

    ann_factor = annualization_factor(period, annualization)

    adj_returns = _adjust_returns(returns, required_return)
    mu = nanmean(adj_returns, axis=0)
    dsr = (_downside_risk if _downside_risk is not None
           else downside_risk(returns, required_return))
    sortino = mu / dsr
    return sortino * ann_factor
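
Returning np.nan when there are fewer than two observations keeps the ratio undefined rather than misleading, and because NaN propagates, downstream code can detect it with np.isnan or skip it with nan-aware aggregations. A minimal sketch of the same guard (illustrative only, not the empyrical implementation):

import numpy as np

def sortino_like(returns, required_return=0.0, ann_factor=252):
    """Illustrative only; mirrors the fewer-than-two-observations guard above."""
    returns = np.asarray(returns, dtype=float)
    if len(returns) < 2:
        return np.nan                       # undefined with fewer than 2 returns
    adj = returns - required_return
    downside = np.sqrt(np.mean(np.square(np.clip(adj, None, 0))))
    return np.nan if downside == 0 else np.mean(adj) / downside * ann_factor

print(sortino_like([0.01]))                 # nan
print(sortino_like([0.01, -0.02, 0.015]))   # a finite annualized ratio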

Example 85

Project: cclib Source File: orcaparser.py
Function: extract
    def extract(self, inputfile, line):
        """Extract information from the file object inputfile."""

        #  extract the version number first
        if "Program Version" in line:
            self.metadata["package_version"] = line.split()[2]

        if line[0:15] == "Number of atoms":

            natom = int(line.split()[-1])
            self.set_attribute('natom', natom)

        if line[1:13] == "Total Charge":

            charge = int(line.split()[-1])
            self.set_attribute('charge', charge)

            line = next(inputfile)

            mult = int(line.split()[-1])
            self.set_attribute('mult', mult)

        # SCF convergence output begins with:
        #
        # --------------
        # SCF ITERATIONS
        # --------------
        #
        # However, there are two common formats which need to be handled, implemented as separate functions.
        if "SCF ITERATIONS" in line:

            self.skip_line(inputfile, 'dashes')

            line = next(inputfile)
            columns = line.split()
            # "Starting incremental Fock matrix formation" doesn't
            # necessarily appear before the extended format.
            if not columns:
                self.parse_scf_expanded_format(inputfile, columns)
            # A header with distinct columns indicates the condensed
            # format.
            elif columns[1] == "Energy":
                self.parse_scf_condensed_format(inputfile, columns)
            # Assume the extended format.
            else:
                self.parse_scf_expanded_format(inputfile, columns)

        # Information about the final iteration, which also includes the convergence
        # targets and the convergence values, is printed separately, in a section like this:
        #
        #       *****************************************************
        #       *                     SUCCESS                       *
        #       *           SCF CONVERGED AFTER   9 CYCLES          *
        #       *****************************************************
        #
        # ...
        #
        # Total Energy       :         -382.04963064 Eh          -10396.09898 eV
        #
        # ...
        #
        # -------------------------   ----------------
        # FINAL SINGLE POINT ENERGY     -382.049630637
        # -------------------------   ----------------
        #
        # We cannot use this last message as a stop condition in general, because
        # often there is vibrational output before it. So we use the 'Total Energy'
        # line. However, what comes after that is different for single point calculations
        # and in the inner steps of geometry optimizations.
        if "SCF CONVERGED AFTER" in line:

            if not hasattr(self, "scfenergies"):
                self.scfenergies = []
            if not hasattr(self, "scfvalues"):
                self.scfvalues = []
            if not hasattr(self, "scftargets"):
                self.scftargets = []

            while not "Total Energy       :" in line:
                line = next(inputfile)
            energy = utils.convertor(float(line.split()[3]), "hartree", "eV")
            self.scfenergies.append(energy)

            self._append_scfvalues_scftargets(inputfile, line)

        # Sometimes the SCF does not converge, but does not halt the
        # run (like in bug 3184890). In this case, we should
        # remain consistent and use the energy from the last reported
        # SCF cycle, for which ORCA prints a banner like this:
        #
        #       *****************************************************
        #       *                     ERROR                         *
        #       *           SCF NOT CONVERGED AFTER   8 CYCLES      *
        #       *****************************************************
        if "SCF NOT CONVERGED AFTER" in line:

            if not hasattr(self, "scfenergies"):
                self.scfenergies = []
            if not hasattr(self, "scfvalues"):
                self.scfvalues = []
            if not hasattr(self, "scftargets"):
                self.scftargets = []

            energy = utils.convertor(self.scfvalues[-1][-1][0], "hartree", "eV")
            self.scfenergies.append(energy)

            self._append_scfvalues_scftargets(inputfile, line)

        # The convergence targets for geometry optimizations are printed at the
        # beginning of the output, although the order and their description is
        # different than later on. So, try to standardize the names of the criteria
        # and save them for later so that we can get the order right.
        #
        #                        *****************************
        #                        * Geometry Optimization Run *
        #                        *****************************
        #
        # Geometry optimization settings:
        # Update method            Update   .... BFGS
        # Choice of coordinates    CoordSys .... Redundant Internals
        # Initial Hessian          InHess   .... Almoef's Model
        #
        # Convergence Tolerances:
        # Energy Change            TolE     ....  5.0000e-06 Eh
        # Max. Gradient            TolMAXG  ....  3.0000e-04 Eh/bohr
        # RMS Gradient             TolRMSG  ....  1.0000e-04 Eh/bohr
        # Max. Displacement        TolMAXD  ....  4.0000e-03 bohr
        # RMS Displacement         TolRMSD  ....  2.0000e-03 bohr
        #
        if line[25:50] == "Geometry Optimization Run":

            stars = next(inputfile)
            blank = next(inputfile)

            line = next(inputfile)
            while line[0:23] != "Convergence Tolerances:":
                line = next(inputfile)

            if hasattr(self, 'geotargets'):
                self.logger.warning('The geotargets attribute should not exist yet. There is a problem in the parser.')
            self.geotargets = []
            self.geotargets_names = []

            # There should always be five tolerance values printed here.
            for i in range(5):
                line = next(inputfile)
                name = line[:25].strip().lower().replace('.', '').replace('displacement', 'step')
                target = float(line.split()[-2])
                self.geotargets_names.append(name)
                self.geotargets.append(target)

        # The convergence targets for relaxed surface scan steps are printed at the
        # beginning of the output, although the order and their description is
        # different than later on. So, try to standardize the names of the criteria
        # and save them for later so that we can get the order right.
        #
        #         *************************************************************
        #         *               RELAXED SURFACE SCAN STEP  12               *
        #         *                                                           *
        #         *   Dihedral ( 11,  10,   3,   4)  : 180.00000000           *
        #         *************************************************************
        #
        # Geometry optimization settings:
        # Update method            Update   .... BFGS
        # Choice of coordinates    CoordSys .... Redundant Internals
        # Initial Hessian          InHess   .... Almoef's Model
        #
        # Convergence Tolerances:
        # Energy Change            TolE     ....  5.0000e-06 Eh
        # Max. Gradient            TolMAXG  ....  3.0000e-04 Eh/bohr
        # RMS Gradient             TolRMSG  ....  1.0000e-04 Eh/bohr
        # Max. Displacement        TolMAXD  ....  4.0000e-03 bohr
        # RMS Displacement         TolRMSD  ....  2.0000e-03 bohr
        if line[25:50] == "RELAXED SURFACE SCAN STEP":

            self.is_relaxed_scan = True
            blank = next(inputfile)
            info = next(inputfile)
            stars = next(inputfile)
            blank = next(inputfile)

            line = next(inputfile)
            while line[0:23] != "Convergence Tolerances:":
                line = next(inputfile)

            self.geotargets = []
            self.geotargets_names = []

            # There should always be five tolerance values printed here.
            for i in range(5):
                line = next(inputfile)
                name = line[:25].strip().lower().replace('.', '').replace('displacement', 'step')
                target = float(line.split()[-2])
                self.geotargets_names.append(name)
                self.geotargets.append(target)

        # After each geometry optimization step, ORCA prints the current convergence
        # parameters and the targets (again), so it is a good idea to check that they
        # have not changed. Note that the order of these criteria here is different
        # from the order at the beginning of the output, so make use of the geotargets_names
        # created before and save the new geovalues in the correct order.
        #
        #          ----------------------|Geometry convergence|---------------------
        #          Item                value                 Tolerance   Converged
        #          -----------------------------------------------------------------
        #          Energy change       0.00006021            0.00000500      NO
        #          RMS gradient        0.00031313            0.00010000      NO
        #          RMS step            0.01596159            0.00200000      NO
        #          MAX step            0.04324586            0.00400000      NO
        #          ....................................................
        #          Max(Bonds)      0.0218      Max(Angles)    2.48
        #          Max(Dihed)        0.00      Max(Improp)    0.00
        #          -----------------------------------------------------------------
        #
        if line[33:53] == "Geometry convergence":

            if not hasattr(self, "geovalues"):
                self.geovalues = []

            headers = next(inputfile)
            dashes = next(inputfile)

            names = []
            values = []
            targets = []
            line = next(inputfile)
            while list(set(line.strip())) != ["."]:
                name = line[10:28].strip().lower()
                value = float(line.split()[2])
                target = float(line.split()[3])
                names.append(name)
                values.append(value)
                targets.append(target)
                line = next(inputfile)

            # The energy change is normally not printed in the first iteration, because
            # there was no previous energy -- in that case assume zero. There are also some
            # edge cases where the energy change is not printed, for example when internal
            # angles become improper and internal coordinates are rebuilt as in regression
            # CuI-MePY2-CH3CN_optxes, and in such cases use NaN.
            newvalues = []
            for i, n in enumerate(self.geotargets_names):
                if (n == "energy change") and (n not in names):
                    if self.is_relaxed_scan:
                        newvalues.append(0.0)
                    else:
                        newvalues.append(numpy.nan)
                else:
                    newvalues.append(values[names.index(n)])
                    assert targets[names.index(n)] == self.geotargets[i]

            self.geovalues.append(newvalues)

        # If this is not an optimization, determine the structure used
        if line[0:21] == "CARTESIAN COORDINATES" and not hasattr(self, "atomcoords"):

            self.skip_line(inputfile, 'dashes')

            atomnos = []
            atomcoords = []
            line = next(inputfile)
            while len(line) > 1:
                broken = line.split()
                atomnos.append(self.table.number[broken[0]])
                atomcoords.append(list(map(float, broken[1:4])))
                line = next(inputfile)

            self.set_attribute('natom', len(atomnos))
            self.set_attribute('atomnos', atomnos)

            self.atomcoords = [atomcoords]

        # There's always a banner announcing the next geometry optimization cycle,
        # which looks something like this:
        #
        #    *************************************************************
        #    *                GEOMETRY OPTIMIZATION CYCLE   2            *
        #    *************************************************************
        if "GEOMETRY OPTIMIZATION CYCLE" in line:

            # Keep track of the current cycle just in case, because some things
            # are printed differently inside the first/last and other cycles.
            self.gopt_cycle = int(line.split()[4])

            self.skip_lines(inputfile, ['s', 'd', 'text', 'd'])

            if not hasattr(self, "atomcoords"):
                self.atomcoords = []

            atomnos = []
            atomcoords = []
            for i in range(self.natom):
                line = next(inputfile)
                broken = line.split()
                atomnos.append(self.table.number[broken[0]])
                atomcoords.append(list(map(float, broken[1:4])))

            self.atomcoords.append(atomcoords)

            self.set_attribute('atomnos', atomnos)

        if line[21:68] == "FINAL ENERGY EVALUATION AT THE STATIONARY POINT":

            if not hasattr(self, 'optdone'):
                self.optdone = []
            self.optdone.append(len(self.atomcoords))

            self.skip_lines(inputfile, ['text', 's', 'd', 'text', 'd'])

            atomcoords = []
            for i in range(self.natom):
                line = next(inputfile)
                broken = line.split()
                atomcoords.append(list(map(float, broken[1:4])))

            self.atomcoords.append(atomcoords)

        if "The optimization did not converge" in line:
            if not hasattr(self, 'optdone'):
                self.optdone = []

        if line[0:16] == "ORBITAL ENERGIES":

            self.skip_lines(inputfile, ['d', 'text', 'text'])

            self.mooccnos = [[]]
            self.moenergies = [[]]

            line = next(inputfile)
            while len(line) > 20:  # restricted calcs are terminated by ------
                info = line.split()
                mooccno = int(float(info[1]))
                moenergy = float(info[2])
                self.mooccnos[0].append(mooccno)
                self.moenergies[0].append(utils.convertor(moenergy, "hartree", "eV"))
                line = next(inputfile)

            line = next(inputfile)

            # handle beta orbitals for UHF
            if line[17:35] == "SPIN DOWN ORBITALS":
                text = next(inputfile)

                self.mooccnos.append([])
                self.moenergies.append([])

                line = next(inputfile)
                while len(line) > 20:  # actually terminated by ------
                    info = line.split()
                    mooccno = int(float(info[1]))
                    moenergy = float(info[2])
                    self.mooccnos[1].append(mooccno)
                    self.moenergies[1].append(utils.convertor(moenergy, "hartree", "eV"))
                    line = next(inputfile)

            if not hasattr(self, 'homos'):
                doubly_occupied = self.mooccnos[0].count(2)
                singly_occupied = self.mooccnos[0].count(1)
                # Restricted closed-shell.
                if doubly_occupied > 0 and singly_occupied == 0:
                    self.set_attribute('homos', [doubly_occupied - 1])
                # Restricted open-shell.
                elif doubly_occupied > 0 and singly_occupied > 0:
                    self.set_attribute('homos', [doubly_occupied + singly_occupied - 1,
                                                 doubly_occupied - 1])
                # Unrestricted.
                else:
                    assert len(self.moenergies) == 2
                    assert doubly_occupied == 0
                    assert self.mooccnos[1].count(2) == 0
                    nbeta = self.mooccnos[1].count(1)
                    self.set_attribute('homos', [singly_occupied - 1, nbeta - 1])

        # So nbasis was parsed at first with the first pattern, but it turns out that
        # semiempirical methods (at least AM1 as reported by Julien Idé) do not use this.
        # For this reason, also check for the second pattern, and use it as an assert
        # if nbasis was already parsed. Regression PCB_1_122.out covers this test case.
        if line[1:32] == "# of contracted basis functions":
            self.set_attribute('nbasis', int(line.split()[-1]))
        if line[1:27] == "Basis Dimension        Dim":
            self.set_attribute('nbasis', int(line.split()[-1]))

        if line[0:14] == "OVERLAP MATRIX":

            self.skip_line(inputfile, 'dashes')

            self.aooverlaps = numpy.zeros((self.nbasis, self.nbasis), "d")
            for i in range(0, self.nbasis, 6):
                self.updateprogress(inputfile, "Overlap")

                header = next(inputfile)
                size = len(header.split())

                for j in range(self.nbasis):
                    line = next(inputfile)
                    broken = line.split()
                    self.aooverlaps[j, i:i+size] = list(map(float, broken[1:size+1]))

        # Molecular orbital coefficients.
        # This is also where atombasis is parsed.
        if line[0:18] == "MOLECULAR ORBITALS":

            self.skip_line(inputfile, 'dashes')

            mocoeffs = [numpy.zeros((self.nbasis, self.nbasis), "d")]
            self.aonames = []
            self.atombasis = []
            for n in range(self.natom):
                self.atombasis.append([])

            for spin in range(len(self.moenergies)):

                if spin == 1:
                    self.skip_line(inputfile, 'blank')
                    mocoeffs.append(numpy.zeros((self.nbasis, self.nbasis), "d"))

                for i in range(0, self.nbasis, 6):

                    self.updateprogress(inputfile, "Coefficients")

                    self.skip_lines(inputfile, ['numbers', 'energies', 'occs'])

                    dashes = next(inputfile)
                    broken = dashes.split()
                    size = len(broken)

                    for j in range(self.nbasis):
                        line = next(inputfile)
                        broken = line.split()

                        # only need this the first time through
                        if spin == 0 and i == 0:
                            atomname = line[3:5].split()[0]
                            num = int(line[0:3])
                            orbital = broken[1].upper()

                            self.aonames.append("%s%i_%s" % (atomname, num+1, orbital))
                            self.atombasis[num].append(j)

                        temp = []
                        vals = line[16:-1]  # -1 to remove the last blank space
                        for k in range(0, len(vals), 10):
                            temp.append(float(vals[k:k+10]))
                        mocoeffs[spin][i:i+size, j] = temp

            self.mocoeffs = mocoeffs

        # Basis set information
        # ORCA prints this out in a somewhat indirect fashion.
        # Therefore, parsing occurs in several steps:
        # 1. read which atom belongs to which basis set group
        if line[0:21] == "BASIS SET INFORMATION":
            line = next(inputfile)
            line = next(inputfile)

            self.tmp_atnames = [] # temporary attribute, needed later
            while(not line[0:5] == '-----'):
                if line[0:4] == "Atom":
                    self.tmp_atnames.append(line[8:12].strip())
                line = next(inputfile)

        # 2. Read information for the basis set groups
        if line[0:25] == "BASIS SET IN INPUT FORMAT":
            line = next(inputfile)
            line = next(inputfile)

            # loop over basis set groups
            gbasis_tmp = {}
            while(not line[0:5] == '-----'):
                if line[1:7] == 'NewGTO':
                    bas_atname = line.split()[1]
                    gbasis_tmp[bas_atname] = []

                    line = next(inputfile)
                    # loop over contracted GTOs
                    while(not line[0:6] == '  end;'):
                        words = line.split()
                        ang = words[0]
                        nprim = int(words[1])

                        # loop over primitives
                        coeff = []
                        for iprim in range(nprim):
                            words = next(inputfile).split()
                            coeff.append( (float(words[1]), float(words[2])) )
                        gbasis_tmp[bas_atname].append((ang, coeff))

                        line = next(inputfile)
                line = next(inputfile)

            # 3. Assign the basis sets to gbasis
            self.gbasis = []
            for bas_atname in self.tmp_atnames:
                self.gbasis.append(gbasis_tmp[bas_atname])
            del self.tmp_atnames

        # Read TDDFT information
        if line[0:18] == "TD-DFT/TDA EXCITED":
            # Could be singlets or triplets
            if line.find("SINGLETS") >= 0:
                sym = "Singlet"
            elif line.find("TRIPLETS") >= 0:
                sym = "Triplet"
            else:
                sym = "Not specified"

            if not hasattr(self, "etenergies"):
                self.etsecs = []
                self.etenergies = []
                self.etsyms = []

            lookup = {'a': 0, 'b': 1}
            line = next(inputfile)
            while line.find("STATE") < 0:
                line = next(inputfile)
            # Contains STATE or is blank
            while line.find("STATE") >= 0:
                broken = line.split()
                self.etenergies.append(float(broken[-2]))
                self.etsyms.append(sym)
                line = next(inputfile)
                sec = []
                # Contains SEC or is blank
                while line.strip():
                    start = line[0:8].strip()
                    start = (int(start[:-1]), lookup[start[-1]])
                    end = line[10:17].strip()
                    end = (int(end[:-1]), lookup[end[-1]])
                    contrib = float(line[35:47].strip())
                    sec.append([start, end, contrib])
                    line = next(inputfile)
                self.etsecs.append(sec)
                line = next(inputfile)

        # This will parse etoscs for TD calculations, but note that ORCA generally
        # prints two sets, one based on the length form of transition dipole moments,
        # the other based on the velocity form. Although these should be identical
        # in the basis set limit, in practice they are rarely the same. Here we will
        # effectively parse just the spectrum based on the length-form.
        if (line[25:44] == "ABSORPTION SPECTRUM" or line[9:28] == "ABSORPTION SPECTRUM") and not hasattr(self, "etoscs"):

            self.skip_lines(inputfile, ['d', 'header', 'header', 'd'])

            self.etoscs = []
            for x in self.etsyms:
                osc = next(inputfile).split()[3]
                if osc == "spin":  # "spin forbidden"
                    osc = 0
                else:
                    osc = float(osc)
                self.etoscs.append(osc)

        if line[0:23] == "VIBRATIONAL FREQUENCIES":

            self.skip_lines(inputfile, ['d', 'b'])

            self.vibfreqs = numpy.zeros((3 * self.natom,), "d")

            for i in range(3 * self.natom):
                line = next(inputfile)
                self.vibfreqs[i] = float(line.split()[1])

            if numpy.any(self.vibfreqs[0:6] != 0):
                msg = "Modes corresponding to rotations/translations "
                msg += "may be non-zero."
                self.logger.warning(msg)

            self.vibfreqs = self.vibfreqs[6:]

        if line[0:12] == "NORMAL MODES":
            """ Format:
            NORMAL MODES
            ------------

            These modes are the cartesian displacements weighted by the diagonal matrix
            M(i,i)=1/sqrt(m[i]) where m[i] is the mass of the displaced atom
            Thus, these vectors are normalized but *not* orthogonal

                              0          1          2          3          4          5
                  0       0.000000   0.000000   0.000000   0.000000   0.000000   0.000000
                  1       0.000000   0.000000   0.000000   0.000000   0.000000   0.000000
                  2       0.000000   0.000000   0.000000   0.000000   0.000000   0.000000
            ...
            """

            self.vibdisps = numpy.zeros((3 * self.natom, self.natom, 3), "d")

            self.skip_lines(inputfile, ['d', 'b', 'text', 'text', 'text', 'b'])

            for mode in range(0, 3 * self.natom, 6):
                header = next(inputfile)
                for atom in range(self.natom):
                    x = next(inputfile).split()[1:]
                    y = next(inputfile).split()[1:]
                    z = next(inputfile).split()[1:]

                    self.vibdisps[mode:mode + 6, atom, 0] = x
                    self.vibdisps[mode:mode + 6, atom, 1] = y
                    self.vibdisps[mode:mode + 6, atom, 2] = z

            self.vibdisps = self.vibdisps[6:]

        if line[0:11] == "IR SPECTRUM":

            self.skip_lines(inputfile, ['d', 'b', 'header', 'd'])

            self.vibirs = numpy.zeros((3 * self.natom,), "d")

            line = next(inputfile)
            while len(line) > 2:
                num = int(line[0:4])
                self.vibirs[num] = float(line.split()[2])
                line = next(inputfile)

            self.vibirs = self.vibirs[6:]

        if line[0:14] == "RAMAN SPECTRUM":

            self.skip_lines(inputfile, ['d', 'b', 'header', 'd'])

            self.vibramans = numpy.zeros((3 * self.natom,), "d")

            line = next(inputfile)
            while len(line) > 2:
                num = int(line[0:4])
                self.vibramans[num] = float(line.split()[2])
                line = next(inputfile)

            self.vibramans = self.vibramans[6:]

        # ORCA will print atomic charges along with the spin populations,
        #   so care must be taken about choosing the proper column.
        # Population analyses are performed usually only at the end
        #   of a geometry optimization or other run, so we want to
        #   leave just the final atom charges.
        # Here is an example for Mulliken charges:
        # --------------------------------------------
        # MULLIKEN ATOMIC CHARGES AND SPIN POPULATIONS
        # --------------------------------------------
        #    0 H :    0.126447    0.002622
        #    1 C :   -0.613018   -0.029484
        #    2 H :    0.189146    0.015452
        #    3 H :    0.320041    0.037434
        # ...
        # Sum of atomic charges         :   -0.0000000
        # Sum of atomic spin populations:    1.0000000
        if line[:23] == "MULLIKEN ATOMIC CHARGES":

            has_spins = "AND SPIN POPULATIONS" in line

            if not hasattr(self, "atomcharges"):
                self.atomcharges = {}
            if has_spins and not hasattr(self, "atomspins"):
                self.atomspins = {}

            self.skip_line(inputfile, 'dashes')

            charges = []
            if has_spins:
                spins = []
            line = next(inputfile)
            while line[:21] != "Sum of atomic charges":
                charges.append(float(line[8:20]))
                if has_spins:
                    spins.append(float(line[20:]))
                line = next(inputfile)
            self.atomcharges["mulliken"] = charges
            if has_spins:
                self.atomspins["mulliken"] = spins

        # Things are the same for Lowdin populations, except that the sums
        #   are not printed (there is a blank line at the end).
        if line[:22] == "LOEWDIN ATOMIC CHARGES":

            has_spins = "AND SPIN POPULATIONS" in line

            if not hasattr(self, "atomcharges"):
                self.atomcharges = {}
            if has_spins and not hasattr(self, "atomspins"):
                self.atomspins = {}

            self.skip_line(inputfile, 'dashes')

            charges = []
            if has_spins:
                spins = []
            line = next(inputfile)
            while line.strip():
                charges.append(float(line[8:20]))
                if has_spins:
                    spins.append(float(line[20:]))
                line = next(inputfile)
            self.atomcharges["lowdin"] = charges
            if has_spins:
                self.atomspins["lowdin"] = spins

        # It is not stated explicitly, but the dipole moment components printed by ORCA
        # seem to be in atomic units, so they will need to be converted. Also, they
        # are most probably calculated with respect to the origin.
        #
        # -------------
        # DIPOLE MOMENT
        # -------------
        #                                 X             Y             Z
        # Electronic contribution:      0.00000      -0.00000      -0.00000
        # Nuclear contribution   :      0.00000       0.00000       0.00000
        #                         -----------------------------------------
        # Total Dipole Moment    :      0.00000      -0.00000      -0.00000
        #                         -----------------------------------------
        # Magnitude (a.u.)       :      0.00000
        # Magnitude (Debye)      :      0.00000
        #
        if line.strip() == "DIPOLE MOMENT":

            self.skip_lines(inputfile, ['d', 'XYZ', 'electronic', 'nuclear', 'd'])
            total = next(inputfile)
            assert "Total Dipole Moment" in total

            reference = [0.0, 0.0, 0.0]
            dipole = numpy.array([float(d) for d in total.split()[-3:]])
            dipole = utils.convertor(dipole, "ebohr", "Debye")

            if not hasattr(self, 'moments'):
                self.moments = [reference, dipole]
            else:
                try:
                    assert numpy.all(self.moments[1] == dipole)
                except AssertionError:
                    self.logger.warning('Overwriting previous multipole moments with new values')
                    self.moments = [reference, dipole]

        # Static polarizability.
        if line.strip() == "THE POLARIZABILITY TENSOR":
            if not hasattr(self, 'polarizabilities'):
                self.polarizabilities = []
            self.skip_lines(inputfile, ['d', 'b'])
            line = next(inputfile)
            assert line.strip() == "The raw cartesian tensor (atomic units):"
            polarizability = []
            for _ in range(3):
                line = next(inputfile)
                polarizability.append(line.split())
            self.polarizabilities.append(numpy.array(polarizability))
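
The branch earlier in this parser that appends numpy.nan fills in a convergence criterion that ORCA did not print, so each parsed geovalues row stays aligned with geotargets_names. A minimal sketch (illustrative numbers only, not taken from any ORCA output) of how such placeholders can be located afterwards:

import numpy

geovalues = numpy.array([
    [numpy.nan, 3.1e-4, 1.6e-2, 4.3e-2],   # first cycle: no energy change printed
    [6.0e-5,    2.2e-4, 8.0e-3, 2.1e-2],
])
print(numpy.isnan(geovalues))            # boolean mask of the missing entries
print(numpy.nanmax(geovalues, axis=0))   # reductions that skip the nan placeholder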

Example 86

Project: bayespy Source File: test_misc.py
Function: test_mean
    def test_mean(self):
        """
        Test the mean function
        """

        self.assertAllClose(misc.mean(3),
                            3)
        with warnings.catch_warnings():
            warnings.simplefilter("ignore", RuntimeWarning)
            self.assertAllClose(misc.mean(np.nan),
                                np.nan)

            self.assertAllClose(misc.mean([[2,3],
                                           [np.nan,np.nan]], 
                                           axis=-1),
                                [2.5,np.nan])
            self.assertAllClose(misc.mean([[2,3],
                                           [np.nan,np.nan]], 
                                           axis=-1,
                                           keepdims=True),
                                [[2.5],[np.nan]])
        
        self.assertAllClose(misc.mean([[2,3],
                                       [np.nan,np.nan]], 
                                       axis=-2),
                            [2,3])
        self.assertAllClose(misc.mean([[2,3],
                                       [np.nan,np.nan]], 
                                       axis=-2,
                                       keepdims=True),
                            [[2,3]])

        self.assertAllClose(misc.mean([[2,3],
                                       [np.nan,np.nan]]),
                            2.5)
        self.assertAllClose(misc.mean([[2,3],
                                       [np.nan,np.nan]],
                                       axis=(-1,-2)),
                            2.5)
        self.assertAllClose(misc.mean([[2,3],
                                       [np.nan,np.nan]], 
                                       keepdims=True),
                            [[2.5]])
        
        pass
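
The expected values in this test match what a nan-aware mean produces: a slice of all nan averages to nan (with a RuntimeWarning, hence the catch_warnings wrapper), while nan entries are skipped along the other axis. Assuming only plain numpy, the same behaviour can be reproduced with numpy.nanmean:

import warnings
import numpy as np

with warnings.catch_warnings():
    warnings.simplefilter("ignore", RuntimeWarning)          # the all-nan row warns
    print(np.nanmean([[2, 3], [np.nan, np.nan]], axis=-1))   # [2.5 nan]
print(np.nanmean([[2, 3], [np.nan, np.nan]], axis=-2))       # [2. 3.]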

Example 87

Project: cesium Source File: time_series.py
Function: init
    def __init__(self, t=None, m=None, e=None, target=None, meta_features={},
                 name=None, path=None, channel_names=None):
        """Create a `TimeSeries` object from measurement values/metadata.

        See `TimeSeries` documentation for parameter values.
        """
        if t is None and m is None:
            raise ValueError("Either times or measurements must be provided.")
        elif m is None:
            m = _default_values_like(t, value=np.nan)

        # If m is 1-dimensional, so are t and e
        if _ndim(m) == 1:
            self.n_channels = 1
            if t is None:
                t = _default_values_like(m, upper=DEFAULT_MAX_TIME)
            if e is None:
                e = _default_values_like(m, value=DEFAULT_ERROR_VALUE)
        # If m is 2-dimensional, t and e could be 1d or 2d; default is 1d
        elif isinstance(m, np.ndarray) and m.ndim == 2:
            self.n_channels = len(m)
            if t is None:
                t = _default_values_like(m[0], upper=DEFAULT_MAX_TIME)
            if e is None:
                e = _default_values_like(m[0], value=DEFAULT_ERROR_VALUE)
        # If m is ragged (list of 1d arrays), t and e should also be ragged
        elif _ndim(m) == 2:
            self.n_channels = len(m)
            if t is None:
                t = _default_values_like(m, upper=DEFAULT_MAX_TIME)
            if e is None:
                e = _default_values_like(m, value=DEFAULT_ERROR_VALUE)
        else:
            raise ValueError("m must be a 1D or 2D array, or a 2D list of"
                             " arrays.")

        self.time = _make_array_if_possible(t)
        self.measurement = _make_array_if_possible(m)
        self.error = _make_array_if_possible(e)

        if _ndim(self.time) == 1 and _ndim(self.measurement) == 2:
            if isinstance(self.measurement, np.ndarray):
                self.time = np.broadcast_to(self.time, self.measurement.shape)
            else:
                raise ValueError("Times for each channel must be provided if m"
                                 " is a ragged array.")
                                 
        if _ndim(self.error) == 1 and _ndim(self.measurement) == 2:
            if isinstance(self.measurement, np.ndarray):
                self.error = np.broadcast_to(self.error, self.measurement.shape)
            else:
                raise ValueError("Errors for each channel must be provided if m"
                                 " is a ragged array.")
                                 
        if not (_compatible_shapes(self.measurement, self.time) and
                _compatible_shapes(self.measurement, self.error)):
            raise ValueError("times, values, errors are not of compatible"
                             " types/sizes. Please refer to the docstring"
                             " for list of allowed input types.")

        self.target = target
        self.meta_features = dict(meta_features)
        self.name = name
        self.path = path
        if channel_names is None:
            self.channel_names = ["channel_{}".format(i)
                                  for i in range(self.n_channels)]
        else:
            self.channel_names = channel_names
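
When only times are given, the measurements above default to an array of numpy.nan with the same shape as t. The _default_values_like helper is internal to cesium, but the plain-numpy analogue of that particular call presumably amounts to something like:

import numpy as np

t = np.linspace(0.0, 10.0, 5)
m = np.full_like(t, np.nan)   # nan-filled array with the shape and dtype of t
print(m)                      # [nan nan nan nan nan]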

Example 88

Project: pysystemtrade Source File: correlations.py
    def __init__(self, data, log=logtoscreen("optimiser"), frequency="W", date_method="expanding",
                 rollyears=20,
                 dict_group=dict(), boring_offdiag=0.99, cleaning=True, **kwargs):
        """

        We generate a correlation from either a pd.DataFrame, or a list of them if we're pooling

        It's important that forward filling (or index / ffill / diff) has been done before we begin

        :param data: Data to get correlations from
        :type data: pd.DataFrame or list if pooling

        :param frequency: Downsampling frequency. Must be "D", "W" or bigger
        :type frequency: str

        :param date_method: Method to pass to generate_fitting_dates
        :type date_method: str

        :param rollyears: If date_method is "rolling", number of years in window
        :type roll_years: int

        :param dict_group: dictionary of groupings; used to replace missing values
        :type dict_group: dict

        :param boring_offdiag: Value used in creating the 'boring' matrix, for when there is no data
        :type boring_offdiag: float

        :param **kwargs: passed to correlation_single_period

        :returns: CorrelationList
        """

        cleaning = str2Bool(cleaning)

        # grouping dictionary, convert to faster, algo friendly, form
        group_dict = group_dict_from_natural(dict_group)

        data = df_from_list(data)
        column_names = list(data.columns)

        data = data.resample(frequency, how="last")

        # Generate time periods
        fit_dates = generate_fitting_dates(
            data, date_method=date_method, rollyears=rollyears)

        size = len(column_names)
        corr_with_no_data = boring_corr_matrix(size, offdiag=boring_offdiag)

        # create a list of correlation matrices
        corr_list = []

        log.terse("Correlation estimate")

        # Now for each time period, estimate correlation
        for fit_period in fit_dates:
            log.msg("Estimating from %s to %s" %
                    (fit_period.period_start, fit_period.period_end))

            if fit_period.no_data:
                # no data to fit with
                corr_with_nan = boring_corr_matrix(
                    size, offdiag=np.nan, diag=np.nan)
                corrmat = corr_with_nan

            else:

                data_for_estimate = data[
                    fit_period.fit_start:fit_period.fit_end]

                corrmat = correlation_single_period(data_for_estimate,
                                                    **kwargs)

            if cleaning:
                current_period_data = data[
                    fit_period.fit_start:fit_period.fit_end]
                must_haves = must_have_item(current_period_data)

                # means we can use earlier correlations with sensible values
                corrmat = clean_correlation(
                    corrmat, corr_with_no_data, must_haves)

            corr_list.append(corrmat)

        setattr(self, "corr_list", corr_list)
        setattr(self, "columns", column_names)
        setattr(self, "fit_dates", fit_dates)

Example 89

Project: mir_eval Source File: segment.py
Function: deviation
def deviation(reference_intervals, estimated_intervals, trim=False):
    """Compute the median deviations between reference
    and estimated boundary times.

    Examples
    --------
    >>> ref_intervals, _ = mir_eval.io.load_labeled_intervals('ref.lab')
    >>> est_intervals, _ = mir_eval.io.load_labeled_intervals('est.lab')
    >>> r_to_e, e_to_r = mir_eval.boundary.deviation(ref_intervals,
    ...                                              est_intervals)

    Parameters
    ----------
    reference_intervals : np.ndarray, shape=(n, 2)
        reference segment intervals, in the format returned by
        :func:`mir_eval.io.load_intervals` or
        :func:`mir_eval.io.load_labeled_intervals`.
    estimated_intervals : np.ndarray, shape=(m, 2)
        estimated segment intervals, in the format returned by
        :func:`mir_eval.io.load_intervals` or
        :func:`mir_eval.io.load_labeled_intervals`.
    trim : boolean
        if ``True``, the first and last intervals are ignored.
        Typically, these denote start (0.0) and end-of-track markers.
        (Default value = False)

    Returns
    -------
    reference_to_estimated : float
        median time from each reference boundary to the
        closest estimated boundary
    estimated_to_reference : float
        median time from each estimated boundary to the
        closest reference boundary

    """

    validate_boundary(reference_intervals, estimated_intervals, trim)

    # Convert intervals to boundaries
    reference_boundaries = util.intervals_to_boundaries(reference_intervals)
    estimated_boundaries = util.intervals_to_boundaries(estimated_intervals)

    # Suppress the first and last intervals
    if trim:
        reference_boundaries = reference_boundaries[1:-1]
        estimated_boundaries = estimated_boundaries[1:-1]

    # If we have no boundaries, we get no score.
    if len(reference_boundaries) == 0 or len(estimated_boundaries) == 0:
        return np.nan, np.nan

    dist = np.abs(np.subtract.outer(reference_boundaries,
                                    estimated_boundaries))

    estimated_to_reference = np.median(dist.min(axis=0))
    reference_to_estimated = np.median(dist.min(axis=1))

    return reference_to_estimated, estimated_to_reference
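
The early return above uses nan to mean "no score" when either boundary list is empty. A caller has to test for that with np.isnan, because nan never compares equal to anything, including itself:

import numpy as np

r_to_e, e_to_r = np.nan, np.nan             # what deviation() returns with no boundaries
print(r_to_e == np.nan)                     # False -- equality cannot detect nan
print(np.isnan(r_to_e), np.isnan(e_to_r))   # True True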

Example 90

Project: statsmodels Source File: results_discrete.py
    def mnlogit(self):
        """
        Results generated with
            anes_data = sm.datasets.anes96.load()
            anes_exog = anes_data.exog
            anes_exog = sm.add_constant(anes_exog, prepend=False)
            mlogit_mod = sm.MNLogit(anes_data.endog, anes_exog)

            alpha = 10 * np.ones((mlogit_mod.J - 1, mlogit_mod.K))
            alpha[-1,:] = 0
            mlogit_l1_res = mlogit_mod.fit_regularized(
            method='l1', alpha=alpha, trim_mode='auto', auto_trim_tol=0.02,
            acc=1e-10)
        """
        self.params = [[ 0.00100163, -0.05864195, -0.06147822, -0.04769671, -0.05222987,
            -0.09522432],
           [ 0.        ,  0.03186139,  0.12048999,  0.83211915,  0.92330292,
             1.5680646 ],
           [-0.0218185 , -0.01988066, -0.00808564, -0.00487463, -0.01400173,
            -0.00562079],
           [ 0.        ,  0.03306875,  0.        ,  0.02362861,  0.05486435,
             0.14656966],
           [ 0.        ,  0.04448213,  0.03252651,  0.07661761,  0.07265266,
             0.0967758 ],
           [ 0.90993803, -0.50081247, -2.08285102, -5.26132955, -4.86783179,
            -9.31537963]]
        self.conf_int = [[[ -0.0646223 ,   0.06662556],
            [         np.nan,          np.nan],
            [ -0.03405931,  -0.00957768],
            [         np.nan,          np.nan],
            [         np.nan,          np.nan],
            [  0.26697895,   1.55289711]],

           [[ -0.1337913 ,   0.01650741],
            [ -0.14477255,   0.20849532],
            [ -0.03500303,  -0.00475829],
            [ -0.11406121,   0.18019871],
            [  0.00479741,   0.08416684],
            [ -1.84626136,   0.84463642]],

           [[ -0.17237962,   0.04942317],
            [ -0.15146029,   0.39244026],
            [ -0.02947379,   0.01330252],
            [         np.nan,          np.nan],
            [ -0.02501483,   0.09006785],
            [ -3.90379391,  -0.26190812]],

           [[ -0.12938296,   0.03398954],
            [  0.62612955,   1.03810876],
            [ -0.02046322,   0.01071395],
            [ -0.13738534,   0.18464256],
            [  0.03017236,   0.12306286],
            [ -6.91227465,  -3.61038444]],

           [[ -0.12469773,   0.02023799],
            [  0.742564  ,   1.10404183],
            [ -0.02791975,  -0.00008371],
            [ -0.08491561,   0.19464431],
            [  0.0332926 ,   0.11201273],
            [ -6.29331126,  -3.44235233]],

           [[ -0.17165567,  -0.01879296],
            [  1.33994079,   1.79618841],
            [ -0.02027503,   0.00903345],
            [ -0.00267819,   0.29581751],
            [  0.05343135,   0.14012026],
            [-11.10419107,  -7.52656819]]]

        self.bse = [[ 0.03348221,  0.03834221,  0.05658338,  0.04167742,  0.03697408,
                 0.03899631],
               [        np.nan,  0.09012101,  0.13875269,  0.10509867,  0.09221543,
                 0.11639184],
               [ 0.00624543,  0.00771564,  0.01091253,  0.00795351,  0.00710116,
                 0.00747679],
               [        np.nan,  0.07506769,         np.nan,  0.08215148,  0.07131762,
                 0.07614826],
               [        np.nan,  0.02024768,  0.02935837,  0.02369699,  0.02008204,
                 0.02211492],
               [ 0.32804638,  0.68646613,  0.92906957,  0.84233441,  0.72729881,
                 0.91267567]]

        self.nnz_params = 32
        self.aic = 3019.4391360294126
        self.bic = 3174.6431733460686

Example 91

Project: finmarketpy Source File: backtestengine.py
    def calculate_leverage_factor(self, returns_df, vol_target, vol_max_leverage, vol_periods=60, vol_obs_in_year=252,
                                  vol_rebalance_freq='BM', data_resample_freq=None, data_resample_type='mean',
                                  returns=True, period_shift=0):
        """
        calculate_leverage_factor - Calculates the time series of leverage for a specified vol target

        Parameters
        ----------
        returns_df : DataFrame
            Asset returns

        vol_target : float
            vol target for assets

        vol_max_leverage : float
            maximum leverage allowed

        vol_periods : int
            number of periods to calculate volatility

        vol_obs_in_year : int
            number of observations in the year

        vol_rebalance_freq : str
            how often to rebalance

        data_resample_type : str
            do we need to resample the underlying data first? (e.g. have we got intraday data?)

        returns : boolean
            is this returns time series or prices?

        period_shift : int
            should we delay the signal by a number of periods?

        Returns
        -------
        pandas.DataFrame
        """

        calculations = Calculations()
        filter = Filter()

        if data_resample_freq is not None:
            return
            # TODO not implemented yet

        if not returns: returns_df = calculations.calculate_returns(returns_df)

        roll_vol_df = calculations.rolling_volatility(returns_df,
                                                      periods=vol_periods, obs_in_year=vol_obs_in_year).shift(
            period_shift)

        # calculate the leverage as function of vol target (with max lev constraint)
        lev_df = vol_target / roll_vol_df
        lev_df[lev_df > vol_max_leverage] = vol_max_leverage

        lev_df = filter.resample_time_series_frequency(lev_df, vol_rebalance_freq, data_resample_type)

        returns_df, lev_df = returns_df.align(lev_df, join='left', axis=0)

        lev_df = lev_df.fillna(method='ffill')
        lev_df.ix[0:vol_periods] = numpy.nan  # ignore the first elements before the vol window kicks in

        return lev_df
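
The last step before returning blanks out the warm-up window with numpy.nan, so the leverage series only starts once the rolling volatility window is full. The .ix indexer used above has since been removed from pandas; assuming pandas is available, the same masking with the current .iloc indexer looks like:

import numpy as np
import pandas as pd

vol_periods = 3
lev = pd.Series([1.2, 1.1, 1.3, 0.9, 1.0])
lev.iloc[0:vol_periods] = np.nan   # ignore the first rows before the vol window kicks in
print(lev.tolist())                # [nan, nan, nan, 0.9, 1.0]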

Example 92

Project: crab Source File: test_pairwise.py
def test_adjusted_cosine():
    """ Check that the pairwise Pearson distances computation"""
    #Idepontent Test
    X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
    EFV = [[0.0, 0.0, 0.0, 0.0, 0.0, 0.0]]
    D = adjusted_cosine(X, X, EFV)
    assert_array_almost_equal(D, [[1.]])

    #Vector x Non Vector
    X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
    Y = [[]]
    EFV = [[]]
    assert_raises(ValueError, adjusted_cosine, X, Y, EFV)

    #Vector A x Vector B
    X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
    Y = [[3.0, 3.5, 1.5, 5.0, 3.5, 3.0]]
    EFV = [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0]]
    D = adjusted_cosine(X, Y, EFV)
    assert_array_almost_equal(D, [[0.80952381]])

    #Vector N x 1
    X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0], [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
    Y = [[3.0, 3.5, 1.5, 5.0, 3.5, 3.0]]
    EFV = [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0]]
    D = adjusted_cosine(X, Y, EFV)
    assert_array_almost_equal(D, [[0.80952381], [0.80952381]])

    #N-Dimensional Vectors
    X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0], [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
    Y = [[3.0, 3.5, 1.5, 5.0, 3.5, 3.0], [2.5, 3.5, 3.0, 3.5, 2.5, 3.0]]
    EFV = [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0]]
    D = adjusted_cosine(X, Y, EFV)
    assert_array_almost_equal(D, [[0.80952381, 1.], [0.80952381, 1.]])

    X = [[2.5, 3.5, 3.0, 3.5, 2.5, 3.0], [3.0, 3.5, 1.5, 5.0, 3.5, 3.0]]
    EFV = [[2.0, 2.0, 2.0, 2.0, 2.0, 2.0]]
    D = adjusted_cosine(X, X, EFV)
    assert_array_almost_equal(D, [[1., 0.80952381], [0.80952381, 1.]])

    X = [[1.0, 0.0], [1.0, 1.0]]
    Y = [[0.0, 0.0]]
    EFV = [[0.0, 0.0]]
    D = adjusted_cosine(X, Y, EFV)
    assert_array_almost_equal(D, [[np.nan], [np.nan]])

    #Test Sparse Matrices
    X = csr_matrix(X)
    Y = csr_matrix(Y)
    EFV = csr_matrix(EFV)
    assert_raises(ValueError, adjusted_cosine, X, Y, EFV)
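
The nan expected in the [[np.nan], [np.nan]] case above presumably comes from the zero vector: after subtracting the EFV offsets, Y is [0, 0], its norm is zero, and 0/0 inside the cosine formula evaluates to nan. A plain-numpy illustration of that division:

import numpy as np

x = np.array([1.0, 0.0])
y = np.array([0.0, 0.0])
with np.errstate(invalid="ignore"):    # silence the 0/0 RuntimeWarning
    cos = np.dot(x, y) / (np.linalg.norm(x) * np.linalg.norm(y))
print(cos, np.isnan(cos))              # nan True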

Example 93

Project: cvxpy Source File: Channel_capacity_BV4.57.py
Function: channel_capacity
def channel_capacity(n,m,sum_x=1):
  '''
Boyd and Vandenberghe, Convex Optimization, exercise 4.57 page 207
Capacity of a communication channel.
  
We consider a communication channel, with input x(t)∈{1,..,n} and
output Y(t)∈{1,...,m}, for t=1,2,... .The relation between the
input and output is given statistically:
p_(i,j) = ℙ(Y(t)=i|X(t)=j), i=1,...,m  j=1,...,n
The matrix P ∈ ℝ^(m*n) is called the channel transition matrix, and
the channel is called a discrete memoryless channel. Assuming X has a
probability distribution denoted x ∈ ℝ^n, i.e.,
x_j = ℙ(X=j), j=1,...,n
The mutual information between X and Y is given by
∑(∑(x_j p_(i,j)log_2(p_(i,j)/∑(x_k p_(i,k)))))
Then channel capacity C is given by
C = sup I(X;Y).
With a variable change of y = Px this becomes
I(X;Y)=  c^T x - ∑(y_i log_2 y_i)
where c_j = ∑(p_(i,j)log_2(p_(i,j)))
  '''
  # n is the number of different input values
  # m is the number of different output values
  if n*m == 0:
    print('The range of both input and output values must be greater than zero')
    return 'failed',np.nan,np.nan
  # P is the channel transition matrix
  P = np.ones((m,n))
  # x is probability distribution of the input signal X(t)
  x = cvx.Variable(rows=n,cols=1)
  # y is the probability distribution of the output signal Y(t)
  y = P*x
  # I is the mutual information between x and y
  c = np.sum(P*np.log2(P),axis=0)
  I = c*x + cvx.sum_entries(cvx.entr(y))
  # Channel capacity maximised by maximising the mutual information
  obj = cvx.Minimize(-I)
  constraints = [cvx.sum_entries(x) == sum_x,x >= 0]
  # Form and solve problem
  prob = cvx.Problem(obj,constraints)
  prob.solve()
  if prob.status=='optimal':
    return prob.status,prob.value,x.value
  else:
    return prob.status,np.nan,np.nan
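
A small numeric illustration of the c_j = ∑(p_(i,j)log_2(p_(i,j))) term from the docstring, for an arbitrary 2x2 transition matrix whose columns sum to one (values chosen for illustration only):

import numpy as np

P = np.array([[0.7, 0.4],
              [0.3, 0.6]])              # P(Y=i | X=j), columns sum to 1
c = np.sum(P * np.log2(P), axis=0)      # per-column sum of p*log2(p)
print(c)                                # roughly [-0.881 -0.971]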

Example 94

Project: statsmodels Source File: factormodels.py
    def fit_find_nfact(self, maxfact=None, skip_crossval=True, cv_iter=None):
        '''estimate the model and selection criteria for up to maxfact factors

        The selection criteria that are calculated are AIC, BIC, and R2_adj, and
        additionally the cross-validation prediction error sum of squares if `skip_crossval`
        is false. Cross-validation is not used by default because it can be
        time consuming to calculate.

        By default the cross-validation method is Leave-one-out on the full dataset.
        A different cross-validation sample can be specified as an argument to
        cv_iter.

        Results are attached in `results_find_nfact`



        '''
        #print 'OLS on Factors'
        if not hasattr(self, 'factors'):
            self.calc_factors()

        hasconst = self.hasconst
        if maxfact is None:
            maxfact = self.factors.shape[1] - hasconst

        if (maxfact+hasconst) < 1:
            raise ValueError('nothing to do, number of factors (incl. constant) should ' +
                             'be at least 1')

        #temporary safety
        maxfact = min(maxfact, 10)

        y0 = self.endog
        results = []
        #xred, fact, eva, eve  = pca(x0, keepdim=0, normalize=1)
        for k in range(1, maxfact+hasconst): # k now includes the constant
            #xred, fact, eva, eve  = pca(x0, keepdim=k, normalize=1)
            # this is faster and same result
            fact = self.factors[:,:k]
            res = sm.OLS(y0, fact).fit()
        ##    print 'k =', k
        ##    print res.params
        ##    print 'aic:  ', res.aic
        ##    print 'bic:  ', res.bic
        ##    print 'llf:  ', res.llf
        ##    print 'R2    ', res.rsquared
        ##    print 'R2 adj', res.rsquared_adj

            if not skip_crossval:
                if cv_iter is None:
                    cv_iter = LeaveOneOut(len(y0))
                prederr2 = 0.
                for inidx, outidx in cv_iter:
                    res_l1o = sm.OLS(y0[inidx], fact[inidx,:]).fit()
                    #print data.endog[outidx], res.model.predict(data.exog[outidx,:]),
                    prederr2 += (y0[outidx] -
                                 res_l1o.model.predict(res_l1o.params, fact[outidx,:]))**2.
            else:
                prederr2 = np.nan

            results.append([k, res.aic, res.bic, res.rsquared_adj, prederr2])

        self.results_find_nfact = results = np.array(results)
        self.best_nfact = np.r_[(np.argmin(results[:,1:3],0), np.argmax(results[:,3],0),
                     np.argmin(results[:,-1],0))]

Example 95

Project: cmonkey2 Source File: BSCM.py
def getVarianceMeanSDvect(ratioVect, n, tolerance = 0.01, maxTime=600, chunkSize=200, verbose=False,
                          expName=None):
    """Given a ratios matrix and a number of genes, figure out the expected distribution of variances
       Will sample background until the mean and sd converge or the operation times out
       Will return a list of variances to be used for statistical tests,
       or return [nan] if ratioVect contains only nan values

     Keyword arguments:
     ratioVect  -- A a vector of ratios
     n          -- The number of genes to sample
     tolerance  -- The fractional tolerance to use as a stopping condition (DEFAULT: 0.01)
     maxTime    -- The approximate maximum time to run in seconds (DEFAULT: 600)
     chunkSize  -- The number of samples to add between test (DEFAULT: 200)
     verbose    -- Set to false to suppress output (DEFAULT: False)
     expName    -- Set to echo this name if verbose = True (DEFAULT: None)

     Usage:
     varDist = getVarianceMeanSD(ratioVect, n)
    """
    ratioVect = [x for x in ratioVect if not math.isnan(x)]

    if verbose:
        logging.info("Calculating background for %d sampled from %d in %s", n, len(ratioVect), expName)

    if n <= 1 or n > len(ratioVect):
        return [np.nan]

    varList = []
    repeat = True
    startTime = dt.datetime.now()

    while repeat:
        newVars = []
        for i in range(0, chunkSize):
            curSample = random.sample(ratioVect, n)
            try:
                newVar = np.var(curSample)
            except:
                newVar = 0
            newVars.append(newVar)

        if len(varList) > 0: #True if past the first sample
            oldMean = np.mean(varList)
            oldVar = np.var(varList)
            varList = varList+newVars
            newMean = np.mean(varList)
            newVar = np.var(varList)
            meanWinTol = abs(newMean-oldMean) < tolerance*abs(oldMean)
            varWinTol = abs(oldVar-newVar) < tolerance*abs(oldVar)
            if meanWinTol and varWinTol:
                repeat = False
        else:
            varList = varList+newVars

        curTime = dt.datetime.now()
        if (curTime-startTime).seconds > maxTime:
            repeat = False

    return varList
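
The first line of the function strips nan entries with math.isnan on a plain list; the same filtering can also be done with a boolean mask once the data is a numpy array. A short comparison (either form works because numpy.nan is an ordinary Python float):

import math
import numpy as np

ratio_vect = [0.5, float("nan"), 1.2, np.nan]
clean_list = [x for x in ratio_vect if not math.isnan(x)]   # list-comprehension form
arr = np.asarray(ratio_vect)
clean_arr = arr[~np.isnan(arr)]                             # boolean-mask form
print(clean_list)   # [0.5, 1.2]
print(clean_arr)    # [0.5 1.2]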

Example 96

Project: pyspeckit Source File: cube_fit.py
def cube_fit(cubefilename, outfilename, errfilename=None, scale_keyword=None,
        vheight=False, verbose=False, signal_cut=3, verbose_level=2,
        clobber=True, **kwargs):
    """
    Light-weight wrapper for cube fitting

    Takes a cube and error map (error will be computed naively if not given)
    and computes moments then fits for each spectrum in the cube.  It then
    saves the fitted parameters to a reasonably descriptive output file whose
    header will look like ::

        PLANE1  = 'amplitude'
        PLANE2  = 'velocity'
        PLANE3  = 'sigma'
        PLANE4  = 'err_amplitude'
        PLANE5  = 'err_velocity'
        PLANE6  = 'err_sigma'
        PLANE7  = 'integral'
        PLANE8  = 'integral_error'
        CDELT3  = 1
        CTYPE3  = 'FITPAR'
        CRVAL3  = 0
        CRPIX3  = 1

    Parameters
    ----------
    errfilename: [ None | string name of .fits file ]
        A two-dimensional error map to use for computing signal-to-noise cuts
    scale_keyword: [ None | Char ]
        Keyword to pass to the data cube loader - multiplies cube by the number
        indexed by this header kwarg if it exists.  e.g., if your cube is in
        T_A units and you want T_A*
    vheight: [ bool ]
        Is there a background to be fit?  Used in moment computation
    verbose: [ bool ] 
    verbose_level: [ int ]
        How loud will the fitting procedure be?  Passed to momenteach and fiteach
    signal_cut: [ float ] 
        Signal-to-Noise ratio minimum.  Spectra with a peak below this S/N ratio
        will not be fit and will be left blank in the output fit parameter cube
    clobber: [ bool ] 
        Overwrite parameter .fits cube if it exists?

    `kwargs` are passed to :class:`pyspeckit.Spectrum.specfit`
    """

    # Load the spectrum
    sp = pyspeckit.Cube(cubefilename,scale_keyword=scale_keyword)
    if errfilename is not None and os.path.exists(errfilename):
        errmap = pyfits.getdata(errfilename)
    else:
        # very simple error calculation... biased and bad, needs improvement
        # try using something from the cubes package
        errmap = sp.cube.std(axis=0)

    # Run the fitter
    sp.mapplot()

    # Compute the moments at each position to come up with reasonable guesses.
    # This speeds up the process enormously, but can easily mess up the fits if
    # there are bad pixels
    sp.momenteach(vheight=vheight, verbose=verbose)
    sp.fiteach(errmap=errmap, multifit=None, verbose_level=verbose_level,
               signal_cut=signal_cut, usemomentcube=True, blank_value=np.nan,
               verbose=verbose, **kwargs)

    # steal the header from the error map
    f = pyfits.open(cubefilename)
    # start replacing components of the pyfits object
    f[0].data = np.concatenate([sp.parcube,sp.errcube,sp.integralmap])
    f[0].header['PLANE1'] = 'amplitude'
    f[0].header['PLANE2'] = 'velocity'
    f[0].header['PLANE3'] = 'sigma'
    f[0].header['PLANE4'] = 'err_amplitude'
    f[0].header['PLANE5'] = 'err_velocity'
    f[0].header['PLANE6'] = 'err_sigma'
    f[0].header['PLANE7'] = 'integral'
    f[0].header['PLANE8'] = 'integral_error'
    f[0].header['CDELT3'] = 1
    f[0].header['CTYPE3'] = 'FITPAR'
    f[0].header['CRVAL3'] = 0
    f[0].header['CRPIX3'] = 1
    # save your work
    f.writeto(outfilename, clobber=clobber)

    return sp
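
Because fiteach is called with blank_value=np.nan, spectra that fail the signal cut are left as nan in the fitted parameter planes. A quick way to count how many pixels were blanked in such a plane (hypothetical array standing in for one plane of sp.parcube):

import numpy as np

amplitude_plane = np.array([[1.2, np.nan],
                            [np.nan, 0.8]])
print(np.isnan(amplitude_plane).sum())   # 2 blanked pixels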

Example 97

Project: empyrical Source File: stats.py
def omega_ratio(returns, risk_free=0.0, required_return=0.0,
                annualization=APPROX_BDAYS_PER_YEAR):
    """Determines the Omega ratio of a strategy.

    Parameters
    ----------
    returns : pd.Series or np.ndarray
        Daily returns of the strategy, noncumulative.
        - See full explanation in :func:`~empyrical.stats.cum_returns`.
    risk_free : int, float
        Constant risk-free return throughout the period
    required_return : float, optional
        Minimum acceptance return of the investor. Threshold over which to
        consider positive vs negative returns. It will be converted to a
        value appropriate for the period of the returns. E.g. An annual minimum
        acceptable return of 100 will translate to a minimum acceptable
        return of 0.018.
    annualization : int, optional
        Factor used to convert the required_return into a daily
        value. Enter 1 if no time period conversion is necessary.

    Returns
    -------
    float
        Omega ratio.

    Note
    -----
    See https://en.wikipedia.org/wiki/Omega_ratio for more details.

    """

    if len(returns) < 2:
        return np.nan

    if annualization == 1:
        return_threshold = required_return
    elif required_return <= -1:
        return np.nan
    else:
        return_threshold = (1 + required_return) ** \
            (1. / annualization) - 1

    returns_less_thresh = returns - risk_free - return_threshold

    numer = sum(returns_less_thresh[returns_less_thresh > 0.0])
    denom = -1.0 * sum(returns_less_thresh[returns_less_thresh < 0.0])

    if denom > 0.0:
        return numer / denom
    else:
        return np.nan
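
The closing branch above is another use of nan as "undefined": when no excess returns fall below the threshold, the denominator is zero and nan is returned instead of attempting the division. The same arithmetic in plain numpy:

import numpy as np

returns_less_thresh = np.array([0.010, 0.020, 0.005])   # every excess return is positive
numer = returns_less_thresh[returns_less_thresh > 0.0].sum()
denom = -1.0 * returns_less_thresh[returns_less_thresh < 0.0].sum()
omega = numer / denom if denom > 0.0 else np.nan
print(omega)   # nan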

Example 98

Project: zipline Source File: continuous_future_reader.py
Function: load_raw_arrays
    def load_raw_arrays(self, columns, start_date, end_date, assets):
        """
        Parameters
        ----------
        fields : list of str
           'open', 'high', 'low', 'close', or 'volume'
        start_dt: Timestamp
           Beginning of the window range.
        end_dt: Timestamp
           End of the window range.
        sids : list of int
           The asset identifiers in the window.

        Returns
        -------
        list of np.ndarray
            A list with an entry per field of ndarrays with shape
            (minutes in range, sids) with a dtype of float64, containing the
            values for the respective field over start and end dt range.
        """
        rolls_by_asset = {}

        tc = self.trading_calendar
        start_session = tc.minute_to_session_label(start_date)
        end_session = tc.minute_to_session_label(end_date)

        for asset in assets:
            rf = self._roll_finders[asset.roll_style]
            rolls_by_asset[asset] = rf.get_rolls(
                asset.root_symbol,
                start_session,
                end_session, asset.offset)

        sessions = tc.sessions_in_range(start_date, end_date)

        minutes = tc.minutes_in_range(start_date, end_date)
        num_minutes = len(minutes)
        shape = num_minutes, len(assets)

        results = []

        # Get partitions
        partitions_by_asset = {}
        for asset in assets:
            partitions = []
            partitions_by_asset[asset] = partitions
            rolls = rolls_by_asset[asset]
            start = start_date
            for roll in rolls:
                sid, roll_date = roll
                start_loc = minutes.searchsorted(start)
                if roll_date is not None:
                    _, end = tc.open_and_close_for_session(
                        roll_date - sessions.freq)
                    end_loc = minutes.searchsorted(end)
                else:
                    end = end_date
                    end_loc = len(minutes) - 1
                partitions.append((sid, start, end, start_loc, end_loc))
                if roll[-1] is not None:
                    start, _ = tc.open_and_close_for_session(
                        tc.minute_to_session_label(minutes[end_loc + 1]))

        for column in columns:
            if column != 'volume':
                out = np.full(shape, np.nan)
            else:
                out = np.zeros(shape, dtype=np.uint32)
            for i, asset in enumerate(assets):
                partitions = partitions_by_asset[asset]
                for sid, start, end, start_loc, end_loc in partitions:
                    if column != 'sid':
                        result = self._bar_reader.load_raw_arrays(
                            [column], start, end, [sid])[0][:, 0]
                    else:
                        result = int(sid)
                    out[start_loc:end_loc + 1, i] = result
            results.append(out)
        return results
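
The column loop above fills price fields with np.full(shape, np.nan) but uses zeros for volume, because the volume array is integer-typed and nan only exists for floating-point dtypes. A short demonstration of that constraint:

import numpy as np

prices = np.full((2, 2), np.nan)             # float64 by default, nan-filled
volume = np.zeros((2, 2), dtype=np.uint32)   # integer dtype cannot hold nan
print(prices.dtype, volume.dtype)            # float64 uint32
try:
    volume[0, 0] = np.nan
except ValueError as err:
    print("cannot store nan in uint32:", err)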

Example 99

Project: scipy Source File: linsolve.py
Function: spsolve
def spsolve(A, b, permc_spec=None, use_umfpack=True):
    """Solve the sparse linear system Ax=b, where b may be a vector or a matrix.

    Parameters
    ----------
    A : ndarray or sparse matrix
        The square matrix A will be converted into CSC or CSR form
    b : ndarray or sparse matrix
        The matrix or vector representing the right hand side of the equation.
        If a vector, b.shape must be (n,) or (n, 1).
    permc_spec : str, optional
        How to permute the columns of the matrix for sparsity preservation.
        (default: 'COLAMD')

        - ``NATURAL``: natural ordering.
        - ``MMD_ATA``: minimum degree ordering on the structure of A^T A.
        - ``MMD_AT_PLUS_A``: minimum degree ordering on the structure of A^T+A.
        - ``COLAMD``: approximate minimum degree column ordering
    use_umfpack : bool, optional
        If True (default), use UMFPACK for the solution. This is only
        referenced if b is a vector and ``scikit-umfpack`` is installed.

    Returns
    -------
    x : ndarray or sparse matrix
        the solution of the sparse linear equation.
        If b is a vector, then x is a vector of size A.shape[1]
        If b is a matrix, then x is a matrix of size (A.shape[1], b.shape[1])

    Notes
    -----
    For solving the matrix expression AX = B, this solver assumes the resulting
    matrix X is sparse, as is often the case for very sparse inputs.  If the
    resulting X is dense, the construction of this sparse result will be
    relatively expensive.  In that case, consider converting A to a dense
    matrix and using scipy.linalg.solve or its variants.
    """
    if not (isspmatrix_csc(A) or isspmatrix_csr(A)):
        A = csc_matrix(A)
        warn('spsolve requires A be CSC or CSR matrix format',
                SparseEfficiencyWarning)

    # b is a vector only if it has shape (n,) or (n, 1)
    b_is_sparse = isspmatrix(b)
    if not b_is_sparse:
        b = asarray(b)
    b_is_vector = ((b.ndim == 1) or (b.ndim == 2 and b.shape[1] == 1))

    A.sort_indices()
    A = A.asfptype()  # upcast to a floating point format

    # validate input shapes
    M, N = A.shape
    if (M != N):
        raise ValueError("matrix must be square (has shape %s)" % ((M, N),))

    if M != b.shape[0]:
        raise ValueError("matrix - rhs dimension mismatch (%s - %s)"
                         % (A.shape, b.shape[0]))

    use_umfpack = use_umfpack and useUmfpack

    if b_is_vector and use_umfpack:
        if b_is_sparse:
            b_vec = b.toarray()
        else:
            b_vec = b
        b_vec = asarray(b_vec, dtype=A.dtype).ravel()

        if noScikit:
            raise RuntimeError('Scikits.umfpack not installed.')

        if A.dtype.char not in 'dD':
            raise ValueError("convert matrix data to double, please, using"
                  " .astype(), or set linsolve.useUmfpack = False")

        umf = umfpack.UmfpackContext(_get_umf_family(A))
        x = umf.linsolve(umfpack.UMFPACK_A, A, b_vec,
                         autoTranspose=True)
    else:
        if b_is_vector and b_is_sparse:
            b = b.toarray()
            b_is_sparse = False

        if not b_is_sparse:
            if isspmatrix_csc(A):
                flag = 1  # CSC format
            else:
                flag = 0  # CSR format

            options = dict(ColPerm=permc_spec)
            x, info = _superlu.gssv(N, A.nnz, A.data, A.indices, A.indptr,
                                    b, flag, options=options)
            if info != 0:
                warn("Matrix is exactly singular", MatrixRankWarning)
                x.fill(np.nan)
            if b_is_vector:
                x = x.ravel()
        else:
            # b is sparse
            Afactsolve = factorized(A)

            if not isspmatrix_csc(b):
                warn('spsolve is more efficient when sparse b '
                     'is in the CSC matrix format', SparseEfficiencyWarning)
                b = csc_matrix(b)

            # Create a sparse output matrix by repeatedly applying
            # the sparse factorization to solve columns of b.
            data_segs = []
            row_segs = []
            col_segs = []
            for j in range(b.shape[1]):
                bj = b[:, j].A.ravel()
                xj = Afactsolve(bj)
                w = np.flatnonzero(xj)
                segment_length = w.shape[0]
                row_segs.append(w)
                col_segs.append(np.ones(segment_length, dtype=int)*j)
                data_segs.append(np.asarray(xj[w], dtype=A.dtype))
            sparse_data = np.concatenate(data_segs)
            sparse_row = np.concatenate(row_segs)
            sparse_col = np.concatenate(col_segs)
            x = A.__class__((sparse_data, (sparse_row, sparse_col)),
                           shape=b.shape, dtype=A.dtype)

    return x
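
For reference, a typical call to scipy.sparse.linalg.spsolve looks like the sketch below; the matrix and right-hand side are made up for illustration. When SuperLU reports an exactly singular matrix, spsolve instead warns and fills the result with np.nan, which is the numpy.nan usage this example highlights.

import numpy as np
from scipy.sparse import csc_matrix
from scipy.sparse.linalg import spsolve

# Small made-up sparse system A x = b.
A = csc_matrix(np.array([[3.0, 0.0, 1.0],
                         [0.0, 2.0, 0.0],
                         [1.0, 0.0, 4.0]]))
b = np.array([5.0, 4.0, 9.0])

x = spsolve(A, b, permc_spec='COLAMD')
print(x)                      # dense vector of length A.shape[1]
print(np.allclose(A @ x, b))  # True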

Example 100

Project: orange3-text Source File: pubmed.py
def _date_to_iso(date):
    possible_date_formats = [
        '%Y %b %d',
        '%Y %b',
        '%Y',
    ]

    season_mapping = {
        'fall': 'Sep',
        'autumn': 'Sep',
        'winter': 'Dec',
        'spring': 'Mar',
        'summer': 'Jun',
    }

    date = date.lower()
    # Map seasons to their respective months.
    for season, month in season_mapping.items():
        date = date.replace(season, month)
    date = date.split('-')[0]  # 2015 Sep-Dec --> 2015 Sep

    time_var = TimeVariable()
    for date_format in possible_date_formats:
        try:
            date_string = datetime.strptime(
                    date, date_format
            ).date().isoformat()
            return time_var.parse(date_string)
        except ValueError:
            continue  # Try the next format.

    warnings.warn(
            'Could not parse "{}" into a date.'.format(date),
            RuntimeWarning
    )
    return time_var.parse(np.nan)
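
The season substitution and format fallback can be exercised without Orange: the helper below is an illustrative re-implementation using only the standard library. Its name and its None return value are mine, not the project's; the original instead falls back to TimeVariable().parse(np.nan).

from datetime import datetime
import warnings

def parse_pubmed_date(date):
    # Illustrative stand-in for the _date_to_iso fallback logic above.
    season_mapping = {'fall': 'Sep', 'autumn': 'Sep', 'winter': 'Dec',
                      'spring': 'Mar', 'summer': 'Jun'}
    date = date.lower()
    for season, month in season_mapping.items():
        date = date.replace(season, month)
    date = date.split('-')[0]  # e.g. '2015 Sep-Dec' -> '2015 Sep'
    for date_format in ('%Y %b %d', '%Y %b', '%Y'):
        try:
            return datetime.strptime(date, date_format).date().isoformat()
        except ValueError:
            continue  # Try the next format.
    warnings.warn('Could not parse "{}" into a date.'.format(date),
                  RuntimeWarning)
    return None

print(parse_pubmed_date('2015 Fall-Winter'))  # '2015-09-01'
print(parse_pubmed_date('2004'))              # '2004-01-01'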