Untitled

library("RODBC");
library("RecordLinkage");
library("RSQLite");
library("gsubfn");
library("proto");
library("chron");
library("RSQLite.extfuns");
library("sqldf");
library("plyr");
memory.limit(400000);

channel<-odbcConnect("replica", "kunalbatra");
allproducts1<-sqlQuery(channel,"SELECT DISTINCT dc.product_offer_group_id AS id, dp.subcategory_id, f.filter_name,
fv.value, dp.brand_id, pvpd.pp_price as price, pvpd.pp_selling_price as selling_price, CONVERT(dp.name USING utf8) AS title,0 as price_rank,
0 as sp_rank FROM dwh.d_catalog_product dc JOIN dwh.d_product dp ON dp.product_offer_id = dc.id AND dp.subcategory_id is not null and dp.subcategory_id=189 and dp.subcategory_id not in (84, 85, 86) join dwh.lkp_product_vendor_product_details pvpd on pvpd.popm_product_offer_id=dc.id
left JOIN snapdeal.filter_value fv ON fv.product_offer_id = dp.product_offer_id left JOIN snapdeal.filter f ON f.id = fv.filter_id GROUP BY 1");

allliveproducts1<-sqlQuery(channel,"SELECT dc.product_offer_group_id as id, dp.subcategory_id, f.filter_name,
fv.value, dp.brand_id, pvpd.pp_price as price, pvpd.pp_selling_price as selling_price, convert(dp.name using utf8) as title,0 as price_rank,
0 as sp_rank FROM dwh.d_catalog_product dc JOIN dwh.d_product dp ON dp.product_offer_id = dc.id and dp.subcategory_id IS NOT NULL and dp.subcategory_id=189 and dp.subcategory_id not in (19,84,85,86,364) join dwh.lkp_product_vendor_product_details pvpd on pvpd.popm_product_offer_id=dc.id
left JOIN snapdeal.filter_value fv ON fv.product_offer_id = dp.product_offer_id left join snapdeal.filter f on f.id = fv.filter_id where dc.catalog_status_id=1 group by 1
HAVING CURDATE() BETWEEN MAX(DATE(dc.start_time)) AND MAX(DATE(dc.end_time))");

allsubcategories<-sqlQuery(channel, "select pc.id from snapdeal.product_category pc where pc.parent_category_id is not null and pc.id=189 and pc.visible=1 and pc.id not in (19,84,85,86,364)");

similarityvalues<-read.csv("/home/kunalbatra/personalisation_kunal/Similar_products/recommendation_data.csv", header=TRUE);

filterweights<-read.csv("/home/kunalbatra/personalisation_kunal/Similar_products/filter_weight.csv", header=TRUE);

try
{
    for(catitr in 1:nrow(allsubcategories))
    {

        logger = 1;

        if(logger==1) print("entered loop");

        allproducts<-subset(allproducts1,subcategory_id==allsubcategories[catitr,1]);
        if(logger==1) print(catitr);
        if(logger==1) print(allsubcategories[catitr,1]);
        if(logger==1) print(nrow(allproducts));
        if(logger==1) print(Sys.time());
        dt1<-as.POSIXct(Sys.time());

        try
        {

            if(nrow(allproducts)<3000)
            {
                allproducts[,8]<-gsub("[^[:alnum:] ]","",allproducts[,8]);

                allliveproducts<-subset(allliveproducts1,subcategory_id==allsubcategories[catitr,1]);

                allliveproducts[,8]<-gsub("[^[:alnum:] ]","",allliveproducts[,8]);

                if(logger==1) print("query finished. ranking now");

                for(i in 1:nrow(allproducts))
                {
                    allproducts[i,9]<-(nrow(allproducts[(allproducts[i,6]>=allproducts$price) & (allproducts[i,2]==allproducts$subcategory_id),])/nrow(allproducts[(allproducts[i,2]==allproducts$subcategory_id),]))*100;
                    allproducts[i,10]<-(nrow(allproducts[(allproducts[i,7]>=allproducts$selling_price) & (allproducts[i,2]==allproducts$subcategory_id),])/nrow(allproducts[(allproducts[i,2]==allproducts$subcategory_id),]))*100;

                }

                for(i in 1:nrow(allliveproducts))
                {
                    allliveproducts[i,9]<-(nrow(allliveproducts[(allliveproducts[i,6]>=allliveproducts$price) & (allliveproducts[i,2]==allliveproducts$subcategory_id),])/nrow(allliveproducts[(allliveproducts[i,2]==allliveproducts$subcategory_id),]))*100;
                    allliveproducts[i,10]<-(nrow(allliveproducts[(allliveproducts[i,7]>=allliveproducts$selling_price) & (allliveproducts[i,2]==allliveproducts$subcategory_id),])/nrow(allliveproducts[(allliveproducts[i,2]==allliveproducts$subcategory_id),]))*100;

                }

                score<-sqldf("select x.*,case when y.Weights is null then x.Similarity*1 else x.Similarity*y.Weights end as final_score
                from similarityvalues x left join filterweights y on x.subcategory_id=y.subcategory and
                x.filter_name=y.filter_name");

                compareprod<-sqldf("SELECT mp.id as mp_id, sp.id as sp_id, sp.subcategory_id, mp.filter_name, mp.value as mp_value, sp.value,case when s.final_score is null then 0 else s.final_score end as final_score
                FROM allproducts mp
                join allliveproducts sp on sp.filter_name = mp.filter_name and sp.subcategory_id = mp.subcategory_id
                left join score s on sp.subcategory_id=s.subcategory_id and s.filter_name=sp.filter_name and mp.value=s.value1 and sp.value=s.value2 where sp.id > mp.id order by 1");

                try
                {
                    if(nrow(compareprod)>1)
                    {
                        attributes<-sqldf("select distinct mp.id as mp_id, sp.id as sp_id, mp.subcategory_id, mp.brand_id as mp_brandID, sp.brand_id,
                        case when mp.brand_id=sp.brand_id then x.weights else 0 end as brand_weight, mp.price_rank as mp_priceRank, sp.price_rank, mp.sp_rank as mp_spRank, sp.sp_rank
                        FROM allproducts mp join allproducts sp on sp.id > mp.id and sp.subcategory_id = mp.subcategory_id
                        left join filterweights x on mp.subcategory_id=x.subcategory and x.filter_name='Brand'");


                        attributes1<-sqldf("select distinct a.*,
                        x.weights*(1-(abs(a.mp_priceRank-a.price_rank)/100)) as price_weight from
                        attributes as a
                        left join filterweights as x on a.subcategory_id=x.subcategory and x.filter_name='price'");

                        rm(attributes);

                        sqldf("create index ix6 on attributes1(subcategory_id)");

                        attributes2<-sqldf("select distinct a.*,
                        x.weights*(1-(abs(a.mp_spRank-a.sp_rank)/100)) as selling_price_weight
                        from attributes1 as a
                        left join filterweights as x on a.subcategory_id=x.subcategory and x.filter_name='selling_price'");

                        rm(attributes1);

                        if(logger==1) print("attributes fat tables done. string comprison started");

                        sqldf("create index ix4 on allproducts(subcategory_id)");
                        sqldf("create index ix5 on filterweights(subcategory)");

                        title_score1<-sqldf("select a.id as mp_id, b.id as sp_id, a.title as a_title, b.title as b_title, c.weights as c_weights
                        from allproducts a
                        join allproducts b on a.subcategory_id=b.subcategory_id and a.id > b.id
                        join filterweights c on b.subcategory_id=c.subcategory and c.filter_name='title'");

                        try
                        {

                            if(nrow(title_score1)>=1)
                            {
                                title_score1[5:6,"title_score"]<-NA

                                if(logger==1) print("attributes fat tables done. levenshtein  started");

                                title_score1[,6]<-levenshteinSim(title_score1[,3],title_score1[,4]);

                                if(logger==1) print("levenshtein finished");

                                title_score1[,6]<-(title_score1[,6])*title_score1[,5];

                                if(logger==1) print("string comprison done. final scoring started");

                                title_score1 <- subset(title_score1, select=c(mp_id,sp_id,title_score));

                                names(title_score1)[names(title_score1)=="mp_id"] <- "sp_id1";
                                names(title_score1)[names(title_score1)=="sp_id"] <- "mp_id1";
                                names(title_score1)[names(title_score1)=="mp_id1"] <- "mp_id";
                                names(title_score1)[names(title_score1)=="sp_id1"] <- "sp_id";

                                attributes3<-merge(attributes2, title_score1, by= c("mp_id", "sp_id"));

                                rm(attributes2);
                                rm(title_score1);

                                sqldf("create index ix1 on compareprod(mp_id)");
                                sqldf("create index ix2 on attributes3(mp_id)");

                                output_test_full<-merge(attributes3,compareprod, by=c('mp_id','sp_id'),all=TRUE);

                                output_test_full[["final_score"]][is.na(output_test_full[["final_score"]])]<-0;

                                output_full_test1<-sqldf("select mp_id,sp_id, sum(final_score) as sum, brand_weight, price_weight, selling_price_weight, title_score from output_test_full group by 1,2");

                                output<-sqldf("select mp_id,sp_id,sum, brand_weight,price_weight,selling_price_weight,title_score, (2*sum + brand_weight + 2*price_weight + 2*selling_price_weight + title_score)/
                                (1 + price_weight + selling_price_weight + sum) as score_final from output_full_test1 group by 1,2");


                                rm(compareprod);

                                output1<-output;


                                names(output1)[names(output1)=="mp_id"] <- "sp_id1";
                                names(output1)[names(output1)=="sp_id"] <- "mp_id1";
                                names(output1)[names(output1)=="mp_id1"] <- "mp_id";
                                names(output1)[names(output1)=="sp_id1"] <- "sp_id";

                                allsimilar<-sqldf("select mp_id, sp_id, score_final
                                from output
                                UNION
                                select mp_id, sp_id, score_final
                                from output1");

                                rm(output);
                                rm(output1);

                                if(logger==1) print("final scoring done. top and live checks");

                                liveproducts<-sqlQuery(channel, "SELECT product_offer_group_id as sp_id,0 as dummy FROM dwh.d_catalog_product WHERE
                                CURDATE() BETWEEN DATE(start_time) AND DATE(end_time) AND catalog_status_id=1");

                                liveproducts<-liveproducts[order(liveproducts$sp_id),];

                                allsimilar<-allsimilar[order(allsimilar$sp_id),];

                                allsimilar1<-merge(allsimilar, liveproducts,by=c("sp_id"));

                                allsimilar_final1<-sqldf("select mp_id,sp_id,max(score_final) as score_final from allsimilar1 group by 1,2");

                                allsimilar_final<-sqldf("select * from allsimilar_final1 order by mp_id,score_final desc");

                                topsimilar<-ddply(allsimilar_final, "mp_id", function(df) df[1:min(nrow(df), 12),]);

                                topsimilar_final <- subset(topsimilar, select=c(mp_id,sp_id,score_final));

                                if(logger==1) print("Phod diya");

                                dt2<-as.POSIXct(Sys.time());

                                timedif<-difftime(dt2,dt1,units="secs");
                                print("process completed in");
                                print(timedif);
                                try
                                {

                                    if(catitr==1)
                                    {
                                        write.table(topsimilar_final, file = "similar_output.csv", append=F, sep=",",col.names=T);
                                    } else
                                    {
                                        write.table(topsimilar_final, file = "similar_output.csv", append=T, sep=",",col.names=F);
                                    }

                                }catch(Exception e) {
                                            }

                                rm(topsimilar);

                                gc();
                            }else
                            {
                                print("process failure for subcategory id");
                                print(allsubcategories[catitr,1]);
                            }
                        } catch(Exception e) {
                                    }
                }else
                {
                    print("process failure for subcategory id");
                    print(allsubcategories[catitr,1]);
                }
            }catch(Exception e) {
            }
        } else {}
    }catch(Exception e) {
                }
   }
}catch(Exception e) {
            }