Advertisement
Guest User

Untitled

a guest
Mar 16th, 2019
211
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
R 10.43 KB | None | 0 0
  1. library("RODBC");
  2. library("RecordLinkage");
  3. library("RSQLite");
  4. library("gsubfn");
  5. library("proto");
  6. library("chron");
  7. library("RSQLite.extfuns");
  8. library("sqldf");
  9. library("plyr");
  10. memory.limit(400000);
  11.  
  12. channel<-odbcConnect("replica", "kunalbatra");
  13. allproducts1<-sqlQuery(channel,"SELECT DISTINCT dc.product_offer_group_id AS id, dp.subcategory_id, f.filter_name,
  14. fv.value, dp.brand_id, pvpd.pp_price as price, pvpd.pp_selling_price as selling_price, CONVERT(dp.name USING utf8) AS title,0 as price_rank,
  15. 0 as sp_rank FROM dwh.d_catalog_product dc JOIN dwh.d_product dp ON dp.product_offer_id = dc.id AND dp.subcategory_id is not null and dp.subcategory_id=189 and dp.subcategory_id not in (84, 85, 86) join dwh.lkp_product_vendor_product_details pvpd on pvpd.popm_product_offer_id=dc.id
  16. left JOIN snapdeal.filter_value fv ON fv.product_offer_id = dp.product_offer_id left JOIN snapdeal.filter f ON f.id = fv.filter_id GROUP BY 1");
  17.  
  18. allliveproducts1<-sqlQuery(channel,"SELECT dc.product_offer_group_id as id, dp.subcategory_id, f.filter_name,
  19. fv.value, dp.brand_id, pvpd.pp_price as price, pvpd.pp_selling_price as selling_price, convert(dp.name using utf8) as title,0 as price_rank,
  20. 0 as sp_rank FROM dwh.d_catalog_product dc JOIN dwh.d_product dp ON dp.product_offer_id = dc.id and dp.subcategory_id IS NOT NULL and dp.subcategory_id=189 and dp.subcategory_id not in (19,84,85,86,364) join dwh.lkp_product_vendor_product_details pvpd on pvpd.popm_product_offer_id=dc.id
  21. left JOIN snapdeal.filter_value fv ON fv.product_offer_id = dp.product_offer_id left join snapdeal.filter f on f.id = fv.filter_id where dc.catalog_status_id=1 group by 1
  22. HAVING CURDATE() BETWEEN MAX(DATE(dc.start_time)) AND MAX(DATE(dc.end_time))");
  23.  
  24. allsubcategories<-sqlQuery(channel, "select pc.id from snapdeal.product_category pc where pc.parent_category_id is not null and pc.id=189 and pc.visible=1 and pc.id not in (19,84,85,86,364)");
  25.  
  26. similarityvalues<-read.csv("/home/kunalbatra/personalisation_kunal/Similar_products/recommendation_data.csv", header=TRUE);
  27.  
  28. filterweights<-read.csv("/home/kunalbatra/personalisation_kunal/Similar_products/filter_weight.csv", header=TRUE);
  29.  
  30. try
  31. {
  32.     for(catitr in 1:nrow(allsubcategories))
  33.     {
  34.  
  35.         logger = 1;
  36.  
  37.         if(logger==1) print("entered loop");
  38.  
  39.         allproducts<-subset(allproducts1,subcategory_id==allsubcategories[catitr,1]);
  40.         if(logger==1) print(catitr);
  41.         if(logger==1) print(allsubcategories[catitr,1]);
  42.         if(logger==1) print(nrow(allproducts));
  43.         if(logger==1) print(Sys.time());
  44.         dt1<-as.POSIXct(Sys.time());
  45.        
  46.         try
  47.         {
  48.  
  49.             if(nrow(allproducts)<3000)
  50.             {
  51.                 allproducts[,8]<-gsub("[^[:alnum:] ]","",allproducts[,8]);
  52.    
  53.                 allliveproducts<-subset(allliveproducts1,subcategory_id==allsubcategories[catitr,1]);
  54.  
  55.                 allliveproducts[,8]<-gsub("[^[:alnum:] ]","",allliveproducts[,8]);
  56.  
  57.                 if(logger==1) print("query finished. ranking now");
  58.  
  59.                 for(i in 1:nrow(allproducts))
  60.                 {
  61.                     allproducts[i,9]<-(nrow(allproducts[(allproducts[i,6]>=allproducts$price) & (allproducts[i,2]==allproducts$subcategory_id),])/nrow(allproducts[(allproducts[i,2]==allproducts$subcategory_id),]))*100;
  62.                     allproducts[i,10]<-(nrow(allproducts[(allproducts[i,7]>=allproducts$selling_price) & (allproducts[i,2]==allproducts$subcategory_id),])/nrow(allproducts[(allproducts[i,2]==allproducts$subcategory_id),]))*100;
  63.  
  64.                 }
  65.  
  66.                 for(i in 1:nrow(allliveproducts))
  67.                 {
  68.                     allliveproducts[i,9]<-(nrow(allliveproducts[(allliveproducts[i,6]>=allliveproducts$price) & (allliveproducts[i,2]==allliveproducts$subcategory_id),])/nrow(allliveproducts[(allliveproducts[i,2]==allliveproducts$subcategory_id),]))*100;
  69.                     allliveproducts[i,10]<-(nrow(allliveproducts[(allliveproducts[i,7]>=allliveproducts$selling_price) & (allliveproducts[i,2]==allliveproducts$subcategory_id),])/nrow(allliveproducts[(allliveproducts[i,2]==allliveproducts$subcategory_id),]))*100;
  70.  
  71.                 }
  72.  
  73.                 score<-sqldf("select x.*,case when y.Weights is null then x.Similarity*1 else x.Similarity*y.Weights end as final_score
  74.                 from similarityvalues x left join filterweights y on x.subcategory_id=y.subcategory and
  75.                 x.filter_name=y.filter_name");
  76.  
  77.                 compareprod<-sqldf("SELECT mp.id as mp_id, sp.id as sp_id, sp.subcategory_id, mp.filter_name, mp.value as mp_value, sp.value,case when s.final_score is null then 0 else s.final_score end as final_score
  78.                 FROM allproducts mp
  79.                 join allliveproducts sp on sp.filter_name = mp.filter_name and sp.subcategory_id = mp.subcategory_id
  80.                 left join score s on sp.subcategory_id=s.subcategory_id and s.filter_name=sp.filter_name and mp.value=s.value1 and sp.value=s.value2 where sp.id > mp.id order by 1");
  81.  
  82.                 try
  83.                 {
  84.                     if(nrow(compareprod)>1)
  85.                     {
  86.                         attributes<-sqldf("select distinct mp.id as mp_id, sp.id as sp_id, mp.subcategory_id, mp.brand_id as mp_brandID, sp.brand_id,
  87.                         case when mp.brand_id=sp.brand_id then x.weights else 0 end as brand_weight, mp.price_rank as mp_priceRank, sp.price_rank, mp.sp_rank as mp_spRank, sp.sp_rank
  88.                         FROM allproducts mp join allproducts sp on sp.id > mp.id and sp.subcategory_id = mp.subcategory_id
  89.                         left join filterweights x on mp.subcategory_id=x.subcategory and x.filter_name='Brand'");
  90.  
  91.  
  92.                         attributes1<-sqldf("select distinct a.*,
  93.                         x.weights*(1-(abs(a.mp_priceRank-a.price_rank)/100)) as price_weight from
  94.                         attributes as a
  95.                         left join filterweights as x on a.subcategory_id=x.subcategory and x.filter_name='price'");
  96.  
  97.                         rm(attributes);
  98.  
  99.                         sqldf("create index ix6 on attributes1(subcategory_id)");
  100.  
  101.                         attributes2<-sqldf("select distinct a.*,
  102.                         x.weights*(1-(abs(a.mp_spRank-a.sp_rank)/100)) as selling_price_weight
  103.                         from attributes1 as a
  104.                         left join filterweights as x on a.subcategory_id=x.subcategory and x.filter_name='selling_price'");
  105.  
  106.                         rm(attributes1);
  107.  
  108.                         if(logger==1) print("attributes fat tables done. string comprison started");
  109.  
  110.                         sqldf("create index ix4 on allproducts(subcategory_id)");
  111.                         sqldf("create index ix5 on filterweights(subcategory)");
  112.  
  113.                         title_score1<-sqldf("select a.id as mp_id, b.id as sp_id, a.title as a_title, b.title as b_title, c.weights as c_weights
  114.                         from allproducts a
  115.                         join allproducts b on a.subcategory_id=b.subcategory_id and a.id > b.id
  116.                         join filterweights c on b.subcategory_id=c.subcategory and c.filter_name='title'");
  117.  
  118.                         try
  119.                         {
  120.  
  121.                             if(nrow(title_score1)>=1)
  122.                             {
  123.                                 title_score1[5:6,"title_score"]<-NA
  124.  
  125.                                 if(logger==1) print("attributes fat tables done. levenshtein  started");
  126.  
  127.                                 title_score1[,6]<-levenshteinSim(title_score1[,3],title_score1[,4]);
  128.  
  129.                                 if(logger==1) print("levenshtein finished");
  130.  
  131.                                 title_score1[,6]<-(title_score1[,6])*title_score1[,5];
  132.  
  133.                                 if(logger==1) print("string comprison done. final scoring started");
  134.  
  135.                                 title_score1 <- subset(title_score1, select=c(mp_id,sp_id,title_score));
  136.  
  137.                                 names(title_score1)[names(title_score1)=="mp_id"] <- "sp_id1";
  138.                                 names(title_score1)[names(title_score1)=="sp_id"] <- "mp_id1";
  139.                                 names(title_score1)[names(title_score1)=="mp_id1"] <- "mp_id";
  140.                                 names(title_score1)[names(title_score1)=="sp_id1"] <- "sp_id";
  141.  
  142.                                 attributes3<-merge(attributes2, title_score1, by= c("mp_id", "sp_id"));
  143.                
  144.                                 rm(attributes2);
  145.                                 rm(title_score1);
  146.  
  147.                                 sqldf("create index ix1 on compareprod(mp_id)");
  148.                                 sqldf("create index ix2 on attributes3(mp_id)");
  149.  
  150.                                 output_test_full<-merge(attributes3,compareprod, by=c('mp_id','sp_id'),all=TRUE);
  151.  
  152.                                 output_test_full[["final_score"]][is.na(output_test_full[["final_score"]])]<-0;
  153.  
  154.                                 output_full_test1<-sqldf("select mp_id,sp_id, sum(final_score) as sum, brand_weight, price_weight, selling_price_weight, title_score from output_test_full group by 1,2");
  155.  
  156.                                 output<-sqldf("select mp_id,sp_id,sum, brand_weight,price_weight,selling_price_weight,title_score, (2*sum + brand_weight + 2*price_weight + 2*selling_price_weight + title_score)/
  157.                                 (1 + price_weight + selling_price_weight + sum) as score_final from output_full_test1 group by 1,2");
  158.  
  159.  
  160.                                 rm(compareprod);
  161.  
  162.                                 output1<-output;
  163.  
  164.  
  165.                                 names(output1)[names(output1)=="mp_id"] <- "sp_id1";
  166.                                 names(output1)[names(output1)=="sp_id"] <- "mp_id1";
  167.                                 names(output1)[names(output1)=="mp_id1"] <- "mp_id";
  168.                                 names(output1)[names(output1)=="sp_id1"] <- "sp_id";
  169.  
  170.                                 allsimilar<-sqldf("select mp_id, sp_id, score_final
  171.                                 from output
  172.                                 UNION
  173.                                 select mp_id, sp_id, score_final
  174.                                 from output1");
  175.  
  176.                                 rm(output);
  177.                                 rm(output1);
  178.  
  179.                                 if(logger==1) print("final scoring done. top and live checks");
  180.  
  181.                                 liveproducts<-sqlQuery(channel, "SELECT product_offer_group_id as sp_id,0 as dummy FROM dwh.d_catalog_product WHERE
  182.                                 CURDATE() BETWEEN DATE(start_time) AND DATE(end_time) AND catalog_status_id=1");
  183.  
  184.                                 liveproducts<-liveproducts[order(liveproducts$sp_id),];
  185.                    
  186.                                 allsimilar<-allsimilar[order(allsimilar$sp_id),];
  187.  
  188.                                 allsimilar1<-merge(allsimilar, liveproducts,by=c("sp_id"));
  189.  
  190.                                 allsimilar_final1<-sqldf("select mp_id,sp_id,max(score_final) as score_final from allsimilar1 group by 1,2");
  191.  
  192.                                 allsimilar_final<-sqldf("select * from allsimilar_final1 order by mp_id,score_final desc");
  193.  
  194.                                 topsimilar<-ddply(allsimilar_final, "mp_id", function(df) df[1:min(nrow(df), 12),]);
  195.  
  196.                                 topsimilar_final <- subset(topsimilar, select=c(mp_id,sp_id,score_final));
  197.                    
  198.                                 if(logger==1) print("Phod diya");
  199.  
  200.                                 dt2<-as.POSIXct(Sys.time());
  201.  
  202.                                 timedif<-difftime(dt2,dt1,units="secs");
  203.                                 print("process completed in");
  204.                                 print(timedif);
  205.                                 try
  206.                                 {
  207.  
  208.                                     if(catitr==1)
  209.                                     {
  210.                                         write.table(topsimilar_final, file = "similar_output.csv", append=F, sep=",",col.names=T);
  211.                                     } else
  212.                                     {    
  213.                                         write.table(topsimilar_final, file = "similar_output.csv", append=T, sep=",",col.names=F);
  214.                                     }
  215.  
  216.                                 }catch(Exception e) {
  217.                                             }
  218.  
  219.                                 rm(topsimilar);
  220.  
  221.                                 gc();
  222.                             }else
  223.                             {
  224.                                 print("process failure for subcategory id");
  225.                                 print(allsubcategories[catitr,1]);
  226.                             }
  227.                         } catch(Exception e) {
  228.                                     }
  229.                 }else
  230.                 {
  231.                     print("process failure for subcategory id");
  232.                     print(allsubcategories[catitr,1]);
  233.                 }
  234.             }catch(Exception e) {
  235.             }
  236.         } else {}
  237.     }catch(Exception e) {
  238.                 }
  239.    }
  240. }catch(Exception e) {
  241.             }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement