Advertisement
Not a member of Pastebin yet?
Sign Up,
it unlocks many cool features!
- library("RODBC");
- library("RecordLinkage");
- library("RSQLite");
- library("gsubfn");
- library("proto");
- library("chron");
- library("RSQLite.extfuns");
- library("sqldf");
- library("plyr");
- memory.limit(400000);
- channel<-odbcConnect("replica", "kunalbatra");
- allproducts1<-sqlQuery(channel,"SELECT DISTINCT dc.product_offer_group_id AS id, dp.subcategory_id, f.filter_name,
- fv.value, dp.brand_id, pvpd.pp_price as price, pvpd.pp_selling_price as selling_price, CONVERT(dp.name USING utf8) AS title,0 as price_rank,
- 0 as sp_rank FROM dwh.d_catalog_product dc JOIN dwh.d_product dp ON dp.product_offer_id = dc.id AND dp.subcategory_id is not null and dp.subcategory_id=189 and dp.subcategory_id not in (84, 85, 86) join dwh.lkp_product_vendor_product_details pvpd on pvpd.popm_product_offer_id=dc.id
- left JOIN snapdeal.filter_value fv ON fv.product_offer_id = dp.product_offer_id left JOIN snapdeal.filter f ON f.id = fv.filter_id GROUP BY 1");
- allliveproducts1<-sqlQuery(channel,"SELECT dc.product_offer_group_id as id, dp.subcategory_id, f.filter_name,
- fv.value, dp.brand_id, pvpd.pp_price as price, pvpd.pp_selling_price as selling_price, convert(dp.name using utf8) as title,0 as price_rank,
- 0 as sp_rank FROM dwh.d_catalog_product dc JOIN dwh.d_product dp ON dp.product_offer_id = dc.id and dp.subcategory_id IS NOT NULL and dp.subcategory_id=189 and dp.subcategory_id not in (19,84,85,86,364) join dwh.lkp_product_vendor_product_details pvpd on pvpd.popm_product_offer_id=dc.id
- left JOIN snapdeal.filter_value fv ON fv.product_offer_id = dp.product_offer_id left join snapdeal.filter f on f.id = fv.filter_id where dc.catalog_status_id=1 group by 1
- HAVING CURDATE() BETWEEN MAX(DATE(dc.start_time)) AND MAX(DATE(dc.end_time))");
- allsubcategories<-sqlQuery(channel, "select pc.id from snapdeal.product_category pc where pc.parent_category_id is not null and pc.id=189 and pc.visible=1 and pc.id not in (19,84,85,86,364)");
- similarityvalues<-read.csv("/home/kunalbatra/personalisation_kunal/Similar_products/recommendation_data.csv", header=TRUE);
- filterweights<-read.csv("/home/kunalbatra/personalisation_kunal/Similar_products/filter_weight.csv", header=TRUE);
- try
- {
- for(catitr in 1:nrow(allsubcategories))
- {
- logger = 1;
- if(logger==1) print("entered loop");
- allproducts<-subset(allproducts1,subcategory_id==allsubcategories[catitr,1]);
- if(logger==1) print(catitr);
- if(logger==1) print(allsubcategories[catitr,1]);
- if(logger==1) print(nrow(allproducts));
- if(logger==1) print(Sys.time());
- dt1<-as.POSIXct(Sys.time());
- try
- {
- if(nrow(allproducts)<3000)
- {
- allproducts[,8]<-gsub("[^[:alnum:] ]","",allproducts[,8]);
- allliveproducts<-subset(allliveproducts1,subcategory_id==allsubcategories[catitr,1]);
- allliveproducts[,8]<-gsub("[^[:alnum:] ]","",allliveproducts[,8]);
- if(logger==1) print("query finished. ranking now");
- for(i in 1:nrow(allproducts))
- {
- allproducts[i,9]<-(nrow(allproducts[(allproducts[i,6]>=allproducts$price) & (allproducts[i,2]==allproducts$subcategory_id),])/nrow(allproducts[(allproducts[i,2]==allproducts$subcategory_id),]))*100;
- allproducts[i,10]<-(nrow(allproducts[(allproducts[i,7]>=allproducts$selling_price) & (allproducts[i,2]==allproducts$subcategory_id),])/nrow(allproducts[(allproducts[i,2]==allproducts$subcategory_id),]))*100;
- }
- for(i in 1:nrow(allliveproducts))
- {
- allliveproducts[i,9]<-(nrow(allliveproducts[(allliveproducts[i,6]>=allliveproducts$price) & (allliveproducts[i,2]==allliveproducts$subcategory_id),])/nrow(allliveproducts[(allliveproducts[i,2]==allliveproducts$subcategory_id),]))*100;
- allliveproducts[i,10]<-(nrow(allliveproducts[(allliveproducts[i,7]>=allliveproducts$selling_price) & (allliveproducts[i,2]==allliveproducts$subcategory_id),])/nrow(allliveproducts[(allliveproducts[i,2]==allliveproducts$subcategory_id),]))*100;
- }
- score<-sqldf("select x.*,case when y.Weights is null then x.Similarity*1 else x.Similarity*y.Weights end as final_score
- from similarityvalues x left join filterweights y on x.subcategory_id=y.subcategory and
- x.filter_name=y.filter_name");
- compareprod<-sqldf("SELECT mp.id as mp_id, sp.id as sp_id, sp.subcategory_id, mp.filter_name, mp.value as mp_value, sp.value,case when s.final_score is null then 0 else s.final_score end as final_score
- FROM allproducts mp
- join allliveproducts sp on sp.filter_name = mp.filter_name and sp.subcategory_id = mp.subcategory_id
- left join score s on sp.subcategory_id=s.subcategory_id and s.filter_name=sp.filter_name and mp.value=s.value1 and sp.value=s.value2 where sp.id > mp.id order by 1");
- try
- {
- if(nrow(compareprod)>1)
- {
- attributes<-sqldf("select distinct mp.id as mp_id, sp.id as sp_id, mp.subcategory_id, mp.brand_id as mp_brandID, sp.brand_id,
- case when mp.brand_id=sp.brand_id then x.weights else 0 end as brand_weight, mp.price_rank as mp_priceRank, sp.price_rank, mp.sp_rank as mp_spRank, sp.sp_rank
- FROM allproducts mp join allproducts sp on sp.id > mp.id and sp.subcategory_id = mp.subcategory_id
- left join filterweights x on mp.subcategory_id=x.subcategory and x.filter_name='Brand'");
- attributes1<-sqldf("select distinct a.*,
- x.weights*(1-(abs(a.mp_priceRank-a.price_rank)/100)) as price_weight from
- attributes as a
- left join filterweights as x on a.subcategory_id=x.subcategory and x.filter_name='price'");
- rm(attributes);
- sqldf("create index ix6 on attributes1(subcategory_id)");
- attributes2<-sqldf("select distinct a.*,
- x.weights*(1-(abs(a.mp_spRank-a.sp_rank)/100)) as selling_price_weight
- from attributes1 as a
- left join filterweights as x on a.subcategory_id=x.subcategory and x.filter_name='selling_price'");
- rm(attributes1);
- if(logger==1) print("attributes fat tables done. string comprison started");
- sqldf("create index ix4 on allproducts(subcategory_id)");
- sqldf("create index ix5 on filterweights(subcategory)");
- title_score1<-sqldf("select a.id as mp_id, b.id as sp_id, a.title as a_title, b.title as b_title, c.weights as c_weights
- from allproducts a
- join allproducts b on a.subcategory_id=b.subcategory_id and a.id > b.id
- join filterweights c on b.subcategory_id=c.subcategory and c.filter_name='title'");
- try
- {
- if(nrow(title_score1)>=1)
- {
- title_score1[5:6,"title_score"]<-NA
- if(logger==1) print("attributes fat tables done. levenshtein started");
- title_score1[,6]<-levenshteinSim(title_score1[,3],title_score1[,4]);
- if(logger==1) print("levenshtein finished");
- title_score1[,6]<-(title_score1[,6])*title_score1[,5];
- if(logger==1) print("string comprison done. final scoring started");
- title_score1 <- subset(title_score1, select=c(mp_id,sp_id,title_score));
- names(title_score1)[names(title_score1)=="mp_id"] <- "sp_id1";
- names(title_score1)[names(title_score1)=="sp_id"] <- "mp_id1";
- names(title_score1)[names(title_score1)=="mp_id1"] <- "mp_id";
- names(title_score1)[names(title_score1)=="sp_id1"] <- "sp_id";
- attributes3<-merge(attributes2, title_score1, by= c("mp_id", "sp_id"));
- rm(attributes2);
- rm(title_score1);
- sqldf("create index ix1 on compareprod(mp_id)");
- sqldf("create index ix2 on attributes3(mp_id)");
- output_test_full<-merge(attributes3,compareprod, by=c('mp_id','sp_id'),all=TRUE);
- output_test_full[["final_score"]][is.na(output_test_full[["final_score"]])]<-0;
- output_full_test1<-sqldf("select mp_id,sp_id, sum(final_score) as sum, brand_weight, price_weight, selling_price_weight, title_score from output_test_full group by 1,2");
- output<-sqldf("select mp_id,sp_id,sum, brand_weight,price_weight,selling_price_weight,title_score, (2*sum + brand_weight + 2*price_weight + 2*selling_price_weight + title_score)/
- (1 + price_weight + selling_price_weight + sum) as score_final from output_full_test1 group by 1,2");
- rm(compareprod);
- output1<-output;
- names(output1)[names(output1)=="mp_id"] <- "sp_id1";
- names(output1)[names(output1)=="sp_id"] <- "mp_id1";
- names(output1)[names(output1)=="mp_id1"] <- "mp_id";
- names(output1)[names(output1)=="sp_id1"] <- "sp_id";
- allsimilar<-sqldf("select mp_id, sp_id, score_final
- from output
- UNION
- select mp_id, sp_id, score_final
- from output1");
- rm(output);
- rm(output1);
- if(logger==1) print("final scoring done. top and live checks");
- liveproducts<-sqlQuery(channel, "SELECT product_offer_group_id as sp_id,0 as dummy FROM dwh.d_catalog_product WHERE
- CURDATE() BETWEEN DATE(start_time) AND DATE(end_time) AND catalog_status_id=1");
- liveproducts<-liveproducts[order(liveproducts$sp_id),];
- allsimilar<-allsimilar[order(allsimilar$sp_id),];
- allsimilar1<-merge(allsimilar, liveproducts,by=c("sp_id"));
- allsimilar_final1<-sqldf("select mp_id,sp_id,max(score_final) as score_final from allsimilar1 group by 1,2");
- allsimilar_final<-sqldf("select * from allsimilar_final1 order by mp_id,score_final desc");
- topsimilar<-ddply(allsimilar_final, "mp_id", function(df) df[1:min(nrow(df), 12),]);
- topsimilar_final <- subset(topsimilar, select=c(mp_id,sp_id,score_final));
- if(logger==1) print("Phod diya");
- dt2<-as.POSIXct(Sys.time());
- timedif<-difftime(dt2,dt1,units="secs");
- print("process completed in");
- print(timedif);
- try
- {
- if(catitr==1)
- {
- write.table(topsimilar_final, file = "similar_output.csv", append=F, sep=",",col.names=T);
- } else
- {
- write.table(topsimilar_final, file = "similar_output.csv", append=T, sep=",",col.names=F);
- }
- }catch(Exception e) {
- }
- rm(topsimilar);
- gc();
- }else
- {
- print("process failure for subcategory id");
- print(allsubcategories[catitr,1]);
- }
- } catch(Exception e) {
- }
- }else
- {
- print("process failure for subcategory id");
- print(allsubcategories[catitr,1]);
- }
- }catch(Exception e) {
- }
- } else {}
- }catch(Exception e) {
- }
- }
- }catch(Exception e) {
- }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement