Advertisement
gur111

Codefights Spam filter question

Sep 25th, 2017
500
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
  1. function reduce(numerator,denominator){
  2.   var gcd = function gcd(a,b){
  3.     return b ? gcd(b, a%b) : a;
  4.   };
  5.   gcd = gcd(numerator,denominator);
  6.   return [numerator/gcd, denominator/gcd];
  7. }
  8.  
  9. function spamDetection(messages, spamSignals) {
  10.     let res = ["passed","","passed","passed"], message, user, msgByUsr = {}, fewerThan5 = 0, contain = 0, contWords = [];
  11.     Array.prototype.unique = function() {
  12.         var a = this.concat();
  13.         for(var i=0; i<a.length; ++i) {
  14.             for(var j=i+1; j<a.length; ++j) {
  15.                 if(a[i] === a[j])
  16.                     a.splice(j--, 1);
  17.             }
  18.         }
  19.  
  20.         return a;
  21.     };
  22.     for(let i = 0; i<messages.length; i++){
  23.         message = messages[i][0];
  24.         user = messages[i][1];
  25.        
  26.         // Check less than 5 words
  27.         if(message.split(" ").length<5) fewerThan5++;
  28.        
  29.         // Check 50% has the same content from the same user
  30.         if(msgByUsr[user] !== undefined){
  31.             if(msgByUsr[user][message] !== undefined) msgByUsr[user][message]++;
  32.             else msgByUsr[user][message] = 1;
  33.         }
  34.         else{
  35.             msgByUsr[user] = {};
  36.             msgByUsr[user][message] = 1;
  37.         }
  38.            
  39.            
  40.            
  41.         // Contain suspicious word
  42.         for(let j = 0; j<spamSignals.length; j++){
  43.             if(message.replace(/[^a-z^A-Z^\ ]/gi, ' ').replace(/[\ \ ]/g, ' ').replace(/[\ \ ]/g, ' ').toLowerCase().split(" ").indexOf(spamSignals[j]) != -1 ){
  44.                 contain++;
  45.                 // if(contWords[spamSignals[j]] === undefined) contWords[spamSignals] = 0
  46.                 contWords.push(spamSignals[j]);
  47.             }
  48.         }
  49.        
  50.        
  51.     }
  52.     console.log("fewer than 5: "+fewerThan5);
  53.     let frac = reduce(fewerThan5, messages.length);
  54.     console.log(frac);
  55.     if(fewerThan5/messages.length > 0.9) res[0] = "failed: "+frac[0]+"/"+frac[1];
  56.    
  57.    
  58.     let amount = 0, max = -1, maxAll = -1, amountAll = 0, maxMsg, maxMsgAll;
  59.     Object.keys(msgByUsr).sort().forEach(function(key1){
  60.         amount = 0; max = -1;
  61.         Object.keys(msgByUsr[key1]).forEach(function(key2){
  62.             amount += msgByUsr[key1][key2];
  63.             if(max < msgByUsr[key1][key2]){
  64.                 max = msgByUsr[key1][key2];
  65.                 maxMsg = key2;
  66.                 if(!maxMsgAll){
  67.                     maxMsgAll = maxMsg;
  68.                     maxAll = 0;
  69.                 }
  70.             }
  71.            
  72.         });
  73.         amountAll += amount;
  74.         if(maxMsgAll === maxMsg){
  75.             maxAll += max;
  76.            
  77.         }
  78.         else{
  79.             if(max > maxAll){
  80.                 maxAll = max;
  81.                 maxMsgAll = maxMsg;
  82.             }
  83.         }
  84.         if(amount >= 2 && max/amount > 0.5) {
  85.             res[1] += " "+key1;
  86.         }
  87.     });
  88.     console.log("Working: "+(messages.length===amountAll));
  89.     if(res[1] !== "") res[1] = "failed:"+res[1];
  90.     else res[1] = "passed";
  91.     console.log("maxAmount: "+maxAll+". Amount: "+amountAll);
  92.     if(amountAll >= 2 && maxAll/amountAll > 0.5)
  93.         res[2] = "failed: "+maxMsgAll;
  94.     console.log("contain: "+contain+". Amount: "+messages.length);
  95.     if(messages.length >= 2 && contain>messages.length/2){
  96.         res[3] = "failed: "+contWords.unique().sort().join(" ");
  97.     }
  98.     return res;
  99. }
  100. //==================== QUESTION ==================
  101. /**
  102.  
  103. Not long ago, a spam campaign originated on some of the major social networks, and it's started to affect Kik users as well. Most of the spam comes from a limited number of highly-motivated individuals, possibly from a single group, who constantly update their spam software. What started off as some simple message-sending bots has now evolved into something that requires a large team of engineers to fight against it.
  104.  
  105. At the beginning, the bots weren't that clever. The spam detection could essentially be narrowed down to checking messages against several simple criteria. For a user's stream of messages over a given time period, the spammer could be identified if:
  106.  
  107.     More than 90 % of all messages had fewer than 5 words (here, a word is defined as a sequence of consecutive letters which is neither immediately preceded nor followed by another letter);
  108.     More than 50 % of messages to any one user had the same content, assuming that there were at least 2 messages to that user;
  109.     More than 50 % of all messages had the same content, assuming that there were at least 2 messages;
  110.     More than 50 % of all messages contained at least one of the words from the given list of spamSignals (the case of the letters doesn't matter).
  111.  
  112. You are applying to the Anti-Spam Team at Kik, so you want to make sure you understand how this basic spam detection program worked. Implement a function that, given a stream of messages and a list of spamSignals, determines whether it's possible that the user might be a spammer by checking against the criteria above.
  113.  
  114. Example
  115.  
  116.     For
  117.  
  118.     messages = [["Sale today!", "2837273"],
  119.                 ["Unique offer!", "3873827"],
  120.                 ["Only today and only for you!", "2837273"],
  121.                 ["Sale today!", "2837273"],
  122.                 ["Unique offer!", "3873827"]]
  123.  
  124.     and spamSignals = ["sale", "discount", "offer"], the output should be
  125.  
  126.     spamDetection(messages, spamSignals) = [
  127.       "passed",
  128.       "failed: 2837273 3873827",
  129.       "passed",
  130.       "failed: offer sale"
  131.     ]
  132.  
  133.     Here are the results of the checks per criterion:
  134.         4 out of 5 (80 %) messages have fewer than five words, which is within acceptable parameters = "passed";
  135.         2 out of 3 messages to user 2837273 are the same and both messages to user 3873827 are the same, which is a good indicator that they might be spam = "failed: 2837273 3873827";
  136.         2 out of 5 (40 %) messages have the same content, which is within acceptable parameters = "passed";
  137.         4 out of 5 (80 %) messages contain words from spamSignals. The two words that appear in the messages are offer and sale and offer is the lexicographically smaller of the two, so the output = "failed: offer sale".
  138.  
  139.     For
  140.  
  141.     messages = [["Check Codefights out", "7284736"],
  142.                 ["Check Codefights out", "7462832"],
  143.                 ["Check Codefights out", "3625374"],
  144.                 ["Check Codefights out", "7264762"]]
  145.  
  146.     and spamSignals = ["sale", "discount", "offer"], the output should be
  147.  
  148.     spamDetection(messages, spamSignals) = [
  149.       "failed: 1/1",
  150.       "passed",
  151.       "failed: Check Codefights out",
  152.       "passed"
  153.     ]
  154.  
  155.     Since all users in messages received only one message each, it's impossible to check the second criterion. The fourth criterion doesn't match: there are not any words from spamSignals in the messages. However, the first and the third criteria failed, since all the messages contain 4 words ("failed: 1/1") and have the exact same content ("failed: Check Codefights out").
  156.  
  157. Input/Output
  158.  
  159.     [time limit] 4000ms (js)
  160.  
  161.     [input] array.array.string messages
  162.  
  163.     An array of messages, where each message is given in the format [message, id of recipient].
  164.  
  165.     Guaranteed constraints:
  166.     1 ≤ messages.length ≤ 100,
  167.     messages[i].length = 2,
  168.     1 ≤ messages[i][0].length ≤ 100,
  169.     1 ≤ int(messages[i][1]) ≤ 109.
  170.  
  171.     [input] array.string spamSignals
  172.  
  173.     An array of unique spam signals, where each spam signal consists of lowercase English letters.
  174.  
  175.     Guaranteed constraints:
  176.     1 ≤ spamSignals.length ≤ 30,
  177.     1 ≤ spamSignals[i].length ≤ 25.
  178.  
  179.     [output] array.string
  180.  
  181.     An array of 4 strings containing the results of the spam checks per criterion. The results for each criterion should be given in the following format:
  182.         "passed" if the check doesn't suggest that the user is a spammer, otherwise:
  183.             for the first criterion: "failed: <failed_ratio>", where <failed_ratio> is the ratio of messages with fewer than 5 words as a reduced fraction;
  184.             for the second criterion: "failed: <recipient_1> <recipient_2> ...", where <recipient_i> is ID of the spammed user. Recipients should be sorted in ascending order of their IDs;
  185.             for the third criterion: "failed: <message>", where <message> is the message, which is equal to more than 50% of all messages;
  186.             for the fourth criterion: "failed: <spamSignal_1> <spamSignal_2> ...", where <spamSignal_i> is the spam signal that appeared in at least one message. Spam signals should be sorted lexicographically.
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement