Untitled

#include <iostream>
#include <vector>
#include <algorithm>
#include <numeric>
#include <tr1/array>
#include <boost/mpl/size_t.hpp>

#include <boost/timer.hpp>

#include <boost/proto/proto.hpp>
#include <boost/tuple/tuple.hpp>

namespace proto = boost::proto;
namespace mpl = boost::mpl;

typedef boost::timer timer_type;

struct tuple_get : proto::callable
{
    template<typename Sig> struct result;

    // this overload seems to be needed
    template<typename This, typename ArgNTag, typename Tuple>
    struct result<This(ArgNTag, Tuple &)>
    {
    // ArgNTag is an instance of argN_tag, which has a nested index typedef
    typedef typename ArgNTag::index N;
    // boost::tuples::element is a trait that returns the type of the Nth element
    typedef typename boost::tuples::element<N::value, Tuple>::type type;
    };

    template<typename This, typename ArgNTag, typename Tuple>
    struct result<This(ArgNTag const &, Tuple const &)>
    {
    // ArgNTag is an instance of argN_tag, which has a nested index typedef
    typedef typename ArgNTag::index N;
    // boost::tuples::element is a trait that returns the type of the Nth element
    typedef typename boost::tuples::element<N::value, Tuple>::type type;
    };

    template<typename ArgNTag, typename Tuple>
    typename result<tuple_get(ArgNTag const &, Tuple const &)>::type
    operator()(ArgNTag const &, Tuple const & tup) const
    {
    typedef typename ArgNTag::index N;
    // boost::get is a function that returns the Nth element of a tuple.
    return boost::get<N::value>(tup);
    }
};

namespace linear_algebra
{
    const size_t n = 512;

    typedef std::tr1::array< double , n > state_type;

    template<typename T > struct is_terminal : mpl::false_ {};
    template<> struct is_terminal< state_type > : mpl::true_ {};
    template<> struct is_terminal< double > : mpl::true_ {};

    template<typename Index> struct argN_tag { typedef Index index; };
    template<typename Index> struct argN_term : proto::terminal< argN_tag<Index> > {};

    argN_term<mpl::size_t<0> >::type const arg1 = {};
    argN_term<mpl::size_t<1> >::type const arg2 = {};

    BOOST_PROTO_DEFINE_OPERATORS( is_terminal , proto::default_domain )

    struct vector_context : proto::callable_context< vector_context const >
    {
        size_t m_i;

        vector_context( size_t i ) : m_i( i ) { }

        typedef double result_type;

        double operator()( proto::tag::terminal , state_type & arr ) const
        {
            return arr[ m_i ];
        }
    };

    using namespace boost::proto;

    // Take any expression and turn each node
    // into a subscript expression, using the
    // state as the RHS.
    struct Distribute:
      proto::or_<
        when<terminal<state_type>, _make_subscript(_, _state)>,
        when<terminal<argN_tag<_> >, _make_subscript(_, _state)>,
        terminal<_>,
        nary_expr<_, vararg<Distribute> >
      >
    {};

    struct Optimize:
      or_<
        when<

          subscript<Distribute, terminal<_> >,
          Distribute(_left, _right)
        >,
        nary_expr<_, vararg<Optimize> >,
        terminal<_>
      >
    {};

    struct EvalOpt:
        when<Distribute, _default<>(Optimize)>
    {};

    struct EvalOpt2:
      or_<
        when< terminal< argN_tag< _ > >
        , tuple_get(_value(_), _state)
        >,
        otherwise< _default< EvalOpt2>( Optimize(proto::_), proto::_state) >
      >
    {};
}

template< typename Expr >
void assign_proto( linear_algebra::state_type &x , Expr const & expr ) __attribute__((noinline));

template< typename Expr >
void assign_proto( linear_algebra::state_type &x , Expr const & expr )
{
    using namespace linear_algebra;
    for( size_t i=0 ; i<n ; ++i )
    {
        vector_context ctx( i );
        x[i] = proto::eval( expr , ctx );
    }
}

template< typename Expr >
void assign_proto_trans( linear_algebra::state_type &x, const Expr& expr) __attribute__((noinline));

template< typename Expr >
void assign_proto_trans( linear_algebra::state_type &x , const Expr& expr)
{
    using namespace linear_algebra;

    for( size_t i=0 ; i<n ; ++i )
    {
        x[i] = EvalOpt()( expr[i] );
    }
}

template< typename Expr >
void assign_proto_trans2( linear_algebra::state_type &x,
                          linear_algebra::state_type const& x1,
                          linear_algebra::state_type const& x2,
                          const Expr& expr) __attribute__((noinline));

template< typename Expr >
void assign_proto_trans2( linear_algebra::state_type &x ,
                          linear_algebra::state_type const& x1,
                          linear_algebra::state_type const& x2,
                          const Expr& expr)
{
    using namespace linear_algebra;

    for( size_t i=0 ; i<n ; ++i )
    {
        // notice boost::ref here, its important
        x[i] = EvalOpt2()( expr[i], boost::make_tuple(boost::ref(x1),boost::ref(x2)) );
    }
}

void assign_full( linear_algebra::state_type &x3 ,
                  const linear_algebra::state_type &x1 ,
                  const linear_algebra::state_type &x2 ) __attribute__((noinline));

void assign_full( linear_algebra::state_type &x3,
                  linear_algebra::state_type const &x1,
                  linear_algebra::state_type const &x2 )
{
    for( size_t i=0 ; i<linear_algebra::n ; ++i )
        x3[i] = x1[i] + x2[i] * 2.0 + x1[i] * 10.0 + 2.0 * ( x1[i] + 3.0 * x2[i] );
}

int main( int argc , char **argv )
{
    using namespace linear_algebra;

    std::clog << n << std::endl;

    const size_t num_of_steps = 1000000;

    double t1 = 0.0 , t2 = 0.0, t3 = 0.0, t4 = 0.0;
    size_t count = 0;
    timer_type timer;

    while( true )
    {
        ++count;
        state_type x1 , x2 , x3_1 , x3_2, x3_3, x3_4;
        std::fill( x3_1.begin() , x3_1.end() , 0.0 );
        std::fill( x3_2.begin() , x3_2.end() , 0.0 );
        std::fill( x3_3.begin() , x3_3.end() , 0.0 );
        std::fill( x3_4.begin() , x3_4.end() , 0.0 );

        for( size_t i=0 ; i<n ; ++i )
        {
            x1[i] = drand48() - 0.5;
            x2[i] = drand48() - 0.5;
        }

        timer.restart();
        for( size_t i=0 ; i<num_of_steps ; ++i )
            assign_proto( x3_1 , x1 + x2 * 2.0 + x1 * 10.0 + 2.0 * ( x1 + 3.0 * x2 ) );
        t1 += timer.elapsed();

        timer.restart();
        for( size_t i=0 ; i<num_of_steps ; ++i )
            assign_full( x3_2 , x1 , x2 );
        t2 += timer.elapsed();

        timer.restart();
        for( size_t i=0 ; i<num_of_steps ; ++i )
            assign_proto_trans( x3_3 , x1 + x2 * 2.0 + x1 * 10.0 + 2.0 * ( x1 + 3.0 * x2 ) );
        t3 += timer.elapsed();

        timer.restart();
        for( size_t i=0 ; i<num_of_steps ; ++i )
            assign_proto_trans2( x3_4, x1, x2, arg1 + arg2*2.0 + arg1 * 10.0 + 2.0 * (arg1 + 3.0*arg2));
        t4 += timer.elapsed();


        std::clog.precision( 8 );
        std::clog.width( 10 );
        std::clog << count << " " << t1 / double( count )
                  << " " << t3 / double( count )
                  << " " << t4 / double( count )
                  << " " << t2 / double( count ) << " ";

        std::clog << std::accumulate( x3_1.begin() , x3_1.end() , 0.0 ) << " ";
        std::clog << std::accumulate( x3_3.begin() , x3_3.end() , 0.0 ) << " ";
        std::clog << std::accumulate( x3_4.begin() , x3_4.end() , 0.0 ) << " ";
        std::clog << std::accumulate( x3_2.begin() , x3_2.end() , 0.0 ) << std::endl;
    }

    return 0;
}