Advertisement
dpitch40

cdiff.c

Jan 29th, 2013
203
0
Never
Not a member of Pastebin yet? Sign Up, it unlocks many cool features!
C 32.64 KB | None | 0 0
  1. // INCLUDE STATEMENTS
  2.  
  3. #include "Python.h"
  4. #include <structmember.h>
  5.  
  6. // MACRO DEFINITIONS
  7.  
  8. #define SAFE_FREE(x) if (x != NULL){free(x);x = NULL;}
  9. #define SAFE_FREE_WITH_DECREF(x, m) { \
  10.     if (x != NULL) { \
  11.         int g; \
  12.         for (g = 0; g < m; g++) { \
  13.             Py_DECREF(x[g]); \
  14.         } \
  15.         free(x); \
  16.         x = NULL; \
  17.     } \
  18. }
  19. #define ASSIGN_MEMBER_NAME(self, x, n) {tmp = self->n; Py_INCREF(x); self->n = x; Py_XDECREF(tmp);}
  20. #define ASSIGN_MEMBER(self, x) ASSIGN_MEMBER_NAME(self, x, x)
  21.  
  22. #define PRINT(x) printf("%s\n", PyString_AsString(PyObject_Str(x)))
  23.  
  24. #define GENERIC_GETTER(attr) {Py_INCREF(self->attr); return self->attr;}
  25. #define TYPE_CHECK(typeChecker, msg) if (value != Py_None && !typeChecker(value)) \
  26.     {PyErr_SetString(PyExc_TypeError, msg); return -1;}
  27.  
  28. #ifndef max
  29.     #define max(a, b) ( ((a) > (b)) ? (a) : (b) )
  30. #endif
  31.  
  32. #ifndef min
  33.     #define min(a, b) ( ((a) < (b)) ? (a) : (b) )
  34. #endif
  35.  
  36. // TYPE DEFINITIONS
  37.  
  38. // Simple struct to get around C's inability to pass arrays between functions.
  39. typedef struct
  40. {
  41.     size_t* arr;
  42.     size_t len;
  43. } ARRAY;
  44.  
  45. // Function for freeing an ARRAY*.
  46. static void free_array(ARRAY* array)
  47. {
  48.     free(array->arr);
  49.     free(array);
  50. }
  51.  
  52. /*
  53.  *  Struct representing a differ to find the longest common subsequence or "diff" of two sequences.
  54.  * Fields:
  55.  *  aIndices            List mapping indices in seqA to indices in the original seqA (before junk filtering)
  56.  *  bIndices            List mapping indices in seqB to indices in the original seqB (before junk filtering)
  57.  *  eqTest              A Python function that, given an element of seqA and one of seqB, returns
  58.  *                      True if they are equal.
  59.  *  junkTest            A Python function that, when called on an element of seqA or seqB, returns
  60.  *                      True if the element is "junk" (to be ignored) and false otherwise.
  61.  *  lcs                 Holds the indices of the longest common subsequence of seqA and seqB.
  62.  *                      Interleaved 1-D array that is twice as long as the LCS. Saved for optimization.
  63.  *  lengths             2-D table created during the LCS algorithm to hold the length of the LCS
  64.  *                      for every possible prefix combination of seqA and seqB. Saved for optimization.
  65.  *                      Simulated using a 1-D array; the row width is seqBLen - matchingPrefixLen
  66.  *                      - matchingPostfixLen + 1 and the column is the same for seqALen.
  67.  *  matchingPostfixLen  The length of the common subsequence at the end of both sequences.
  68.  *  matchingPrefixLen   The length of the common subsequence at the start of both sequences.
  69.  *  rowSetup            A Python function that will be called before each new "row" of the LCS length
  70.  *                      table--that is, before proceeding to compare elements of seqB with a new
  71.  *                      element of seqA, with the seqA element as its argument.
  72.  *  seqA                The first sequence to compare.
  73.  *  seqALen             The number of elements in the first sequence.
  74.  *  seqAList            The original list given to specify seqA.
  75.  *  seqB                The second sequence to compare.
  76.  *  seqBLen             The number of elements in the second sequence.
  77.  *  seqBList            The original list given to specify seqB.
  78.  */
  79. typedef struct
  80. {
  81.     PyObject_HEAD
  82.  
  83.     PyObject* seqAList;
  84.     PyObject** seqA;
  85.     size_t seqALen;
  86.     size_t* aIndices;
  87.  
  88.     PyObject* seqBList;
  89.     PyObject** seqB;
  90.     size_t seqBLen;
  91.     size_t* bIndices;
  92.  
  93.     size_t matchingPrefixLen;
  94.     size_t matchingPostfixLen;
  95.  
  96.     size_t* lengths;
  97.     ARRAY* lcs;
  98.  
  99.     PyObject* eqTest;
  100.     PyObject* junkTest;
  101.     PyObject* rowSetup;
  102.    
  103. } DIFFER;
  104.  
  105. // FUNCTION DEFINITIONS
  106.  
  107. // Resets the calculated LCS information of a DIFFER*.
  108. // Frees and nulls lengths.
  109. // Frees and nulls lcs.
  110. static void reset_differ_lcs(DIFFER* differ)
  111. {
  112.     SAFE_FREE(differ->lengths);
  113.     if (differ->lcs != NULL)
  114.     {
  115.         free_array(differ->lcs);
  116.         differ->lcs = NULL;
  117.     }
  118. }
  119.  
  120.  
  121. // Resets the first sequence of the DIFFER* to be uninitialized, along with all data dependent on it.
  122. // Frees and nulls seqA, decrements the reference count of its members.
  123. // Sets seqALen to 0.
  124. // Frees and nulls aIndices.
  125. // Decrements the reference count of seqAList, nulls it.
  126. // Sets the matchingPrefixLength and matchingPostfixLength to 0.
  127. // Resets the LCS information.
  128. static void reset_differ_seqA(DIFFER* differ)
  129. {
  130.     SAFE_FREE_WITH_DECREF(differ->seqA, differ->seqALen);
  131.     differ->seqALen = 0;
  132.     SAFE_FREE(differ->aIndices);
  133.     if (differ->seqAList != NULL)
  134.     {
  135.         Py_DECREF(differ->seqAList);
  136.         differ->seqAList = NULL;
  137.     }
  138.  
  139.     differ->matchingPrefixLen = 0;
  140.     differ->matchingPostfixLen = 0;
  141.  
  142.     reset_differ_lcs(differ);
  143. }
  144.  
  145. // Resets the second sequence of the DIFFER* to be uninitialized, along with all data dependent on it.
  146. // Frees and nulls seqB, decrements the reference count of its members.
  147. // Sets seqBLen to 0.
  148. // Frees and nulls bIndices.
  149. // Decrements the reference count of seqBList, nulls it.
  150. // Sets the matchingPrefixLength and matchingPostfixLength to 0.
  151. // Resets the LCS information.
  152. static void reset_differ_seqB(DIFFER* differ)
  153. {
  154.     SAFE_FREE_WITH_DECREF(differ->seqB, differ->seqBLen);
  155.     differ->seqBLen = 0;
  156.     SAFE_FREE(differ->bIndices);
  157.     if (differ->seqBList != NULL)
  158.     {
  159.         Py_DECREF(differ->seqBList);
  160.         differ->seqBList = NULL;
  161.     }
  162.  
  163.     differ->matchingPrefixLen = 0;
  164.     differ->matchingPostfixLen = 0;
  165.  
  166.     reset_differ_lcs(differ);
  167. }
  168.  
  169. // Function for freeing a DIFFER* and all of its subfields.
  170. // Resets both sequences of the differ.
  171. // Decrements the reference count of eqTest, junkTest, and rowSetup. (Assumes they are not NULL)
  172. // Calls the tp_free member of the differ's type.
  173. static void free_differ(DIFFER* differ)
  174. {
  175.     reset_differ_seqA(differ);
  176.     reset_differ_seqB(differ);
  177.  
  178.     Py_DECREF(differ->eqTest);
  179.     Py_DECREF(differ->junkTest);
  180.     Py_DECREF(differ->rowSetup);
  181.  
  182.     differ->ob_type->tp_free((PyObject*)differ);
  183. }
  184.  
  185. /*  Filters a Python sequence and returns a filtered list of indices.
  186.  * Inputs:
  187.  *  seq         sequence    The sequence to filter.
  188.  *  junkTest    function    A function that returns true for "junk" elements of the sequence.
  189.  * Outputs:
  190.  *  An allocated ARRAY* of indices corresponding to the non-junk sequence elemebts.
  191. */
  192. static ARRAY* filter_sequence(PyObject* seq, PyObject* junkTest)
  193. {
  194.     ARRAY* indices = malloc(sizeof(ARRAY));
  195.     size_t i;
  196.     size_t seqLength = PyList_Size(seq);
  197.     unsigned int goodCount = 0;
  198.     unsigned char isGood[seqLength];
  199.     PyObject* junkResult;
  200.  
  201.     // Check which entries of the list are junk
  202.     for (i = 0; i < seqLength; i++)
  203.     {
  204.         junkResult = PyObject_CallFunctionObjArgs(junkTest, PyList_GetItem(seq, i), NULL);
  205.         if (PyObject_Not(junkResult))
  206.         {
  207.             goodCount++;
  208.             isGood[i] = 1;
  209.         }
  210.         else
  211.         {
  212.             isGood[i] = 0;
  213.         }
  214.         Py_DECREF(junkResult);
  215.     }
  216.     // Create indices array to return
  217.     indices->len = goodCount;
  218.     size_t* indicesArr = malloc(sizeof(size_t) * goodCount);
  219.     int indI = 0;
  220.     for (i = 0; i < seqLength; i++)
  221.     {
  222.         if (isGood[i])
  223.         {
  224.             indicesArr[indI] = i;
  225.             indI++;
  226.         }
  227.     }
  228.     indices->arr = indicesArr;
  229.     return indices;
  230. }
  231.  
  232. /*  Sets one of the sequences of a Differ to a new value.
  233.  * Inputs:
  234.  *  differ      The differ to assign to.
  235.  *  seq         The new sequence.
  236.  *  a           If 1, the differ's seqA will be overwritten. If 0, seqB wil be overwritten.
  237. */
  238. static void _differ_set_seq(DIFFER* differ, PyObject* newSeq, char a)
  239. {
  240.     int i;
  241.  
  242.     PyObject** seq;
  243.     size_t seqLen;
  244.     size_t* indicesArr;
  245.  
  246.     if (differ->junkTest != Py_None)
  247.     {
  248.         ARRAY* indices = filter_sequence(newSeq, differ->junkTest);
  249.         seqLen = indices->len;
  250.         indicesArr = indices->arr;
  251.         free(indices);
  252.     }
  253.     else
  254.     {
  255.         seqLen = PyList_Size(newSeq);
  256.         indicesArr = malloc(sizeof(size_t) * seqLen);
  257.         for (i = 0; i < seqLen; i++)
  258.         {
  259.             indicesArr[i] = i;
  260.         }
  261.     }
  262.  
  263.     seq = malloc(sizeof(PyObject*) * seqLen);
  264.     for (i = 0; i < seqLen; i++)
  265.     {
  266.         seq[i] = PyList_GetItem(newSeq, i);
  267.         Py_INCREF(seq[i]);
  268.     }
  269.  
  270.     PyObject* tmp;
  271.  
  272.     if (a)
  273.     {
  274.         reset_differ_seqA(differ);
  275.         differ->seqA = seq;
  276.         differ->seqALen = seqLen;
  277.         differ->aIndices = indicesArr;
  278.         ASSIGN_MEMBER_NAME(differ, newSeq, seqAList);
  279.     }
  280.     else
  281.     {
  282.         reset_differ_seqB(differ);
  283.         differ->seqB = seq;
  284.         differ->seqBLen = seqLen;
  285.         differ->bIndices = indicesArr;
  286.         ASSIGN_MEMBER_NAME(differ, newSeq, seqBList);
  287.     }
  288. }
  289.  
  290.  
  291.  
  292.  
  293. //Custom setter/getter for a differ's sequence A that ensures data coherence and checks type
  294. static int py_differ_set_a(DIFFER* self, PyObject* value, void* closure)
  295. {
  296.     TYPE_CHECK(PySequence_Check, "a must be a sequence");
  297.     _differ_set_seq(self, value, 1);
  298.     return 0;
  299. }
  300. static PyObject* py_differ_get_a(DIFFER* self, void* closure) {GENERIC_GETTER(seqAList)}
  301.  
  302. //Custom setter/getter for a differ's sequence B that ensures data coherence and checks type
  303. static int py_differ_set_b(DIFFER* self, PyObject* value, void* closure)
  304. {
  305.     TYPE_CHECK(PySequence_Check, "b must be a sequence");
  306.     _differ_set_seq(self, value, 0);
  307.     return 0;
  308. }
  309. static PyObject* py_differ_get_b(DIFFER* self, void* closure) {GENERIC_GETTER(seqBList)}
  310.  
  311. static int py_differ_set_eqTest(DIFFER* self, PyObject* value, void* closure)
  312. {
  313.     PyObject* tmp;
  314.  
  315.     TYPE_CHECK(PyFunction_Check, "eqTest must be a function");
  316.     ASSIGN_MEMBER_NAME(self, value, eqTest);
  317.     return 0;
  318. }
  319. static PyObject* py_differ_get_eqTest(DIFFER* self, void* closure) {GENERIC_GETTER(eqTest)}
  320.  
  321. static int py_differ_set_junkTest(DIFFER* self, PyObject* value, void* closure)
  322. {
  323.     PyObject* tmp;
  324.  
  325.     TYPE_CHECK(PyFunction_Check, "junkTest must be a function");
  326.     ASSIGN_MEMBER_NAME(self, value, junkTest);
  327.     return 0;
  328. }
  329. static PyObject* py_differ_get_junkTest(DIFFER* self, void* closure) {GENERIC_GETTER(junkTest)}
  330.  
  331. static int py_differ_set_rowSetup(DIFFER* self, PyObject* value, void* closure)
  332. {
  333.     PyObject* tmp;
  334.  
  335.     TYPE_CHECK(PyFunction_Check, "rowSetup must be a function");
  336.     ASSIGN_MEMBER_NAME(self, value, rowSetup);
  337.     return 0;
  338. }
  339. static PyObject* py_differ_get_rowSetup(DIFFER* self, void* closure) {GENERIC_GETTER(rowSetup)}
  340.  
  341. // Initializes a DIFFER* struct with the given Python methods. If None, eqTest defaults to operator.eq.
  342. static void setup_differ(DIFFER* differ, PyObject* eqTest, PyObject* junkTest, PyObject* rowSetup)
  343. {
  344.     differ->seqA = NULL;
  345.     differ->seqALen = 0;
  346.     differ->aIndices = NULL;
  347.     differ->seqAList = NULL;
  348.  
  349.     differ->seqB = NULL;
  350.     differ->seqBLen = 0;
  351.     differ->bIndices = NULL;
  352.     differ->seqBList = NULL;
  353.  
  354.     differ->matchingPrefixLen = 0;
  355.     differ->matchingPostfixLen = 0;
  356.  
  357.     differ->lengths = NULL;
  358.     differ->lcs = NULL;
  359.  
  360.     PyObject* tmp;
  361.  
  362.     if (eqTest != Py_None)
  363.     {
  364.         py_differ_set_eqTest(differ, eqTest, NULL);
  365.     }
  366.     else
  367.     {
  368.         PyObject* modules = PyImport_GetModuleDict();
  369.         PyObject* opMod = PyDict_GetItemString(modules, "operator");
  370.         PyObject* eq = PyObject_GetAttrString(opMod, "eq");
  371.         ASSIGN_MEMBER_NAME(differ, eq, eqTest)
  372.     }
  373.  
  374.     py_differ_set_junkTest(differ, junkTest, NULL);
  375.     py_differ_set_rowSetup(differ, rowSetup, NULL);
  376. }
  377.  
  378. /*  Initializes the Differ object.
  379.  * Python API:
  380.  *  a           sequence    The first sequence to compare.
  381.  *  b           sequence    The second sequence to compare.
  382.  *  eqTest      function    A function taking one element of a and one of b that returns true iff
  383.  *                          they are considered "equal". If None, defaults to operator.eq.
  384.  *  junkTest    function    A function taking one element of a or b that returns True iff it is
  385.  *                          to be considered "junk" and not compared. If None, no elements will be
  386.  *                          considered junk.
  387.  *  rowSetup    function    A function that is called with each successive element of a just before
  388.  *                          it is compared. The element of a that is supplied as the first argument
  389.  *                          of eqTest will not change in between calls to rowSetup. Useful for
  390.  *                          equality tests that can be optimized if only one element being compared
  391.  *                          changes at a time. If None, does not call any row setup function.
  392.  */
  393. static int Differ_init(DIFFER* self, PyObject* args, PyObject* kwds)
  394. {
  395.     PyObject* a;
  396.     PyObject* b;
  397.     PyObject* eqTest;
  398.     PyObject* junkTest;
  399.     PyObject* rowSetup;
  400.  
  401.     if (!PyArg_ParseTuple(args, "OOOOO", &a, &b, &eqTest, &junkTest, &rowSetup))
  402.     {
  403.         return -1;
  404.     }
  405.  
  406.     setup_differ(self, eqTest, junkTest, rowSetup);
  407.     py_differ_set_a(self, a, NULL);
  408.     py_differ_set_b(self, b, NULL);
  409.  
  410.     return 0;
  411. }
  412.  
  413. // Creates and returns a new Differ object
  414. static PyObject* Differ_new(PyTypeObject* type, PyObject* args, PyObject* kwds)
  415. {
  416.     DIFFER* differ = (DIFFER*)type->tp_alloc(type, 0);
  417.     setup_differ(differ, Py_None, Py_None, Py_None);
  418.  
  419.     return (PyObject*)differ;
  420. }
  421.  
  422. // Methods for cyclic garbage collection
  423. static int Differ_traverse(DIFFER* self, visitproc visit, void* arg)
  424. {
  425.     Py_VISIT(self->seqAList);
  426.     Py_VISIT(self->seqBList);
  427.     return 0;
  428. }
  429. static int Differ_clear(DIFFER* self)
  430. {
  431.     Py_CLEAR(self->seqAList);
  432.     Py_CLEAR(self->seqBList);
  433.     return 0;
  434. }
  435.  
  436.  
  437. // LCS ALGORITHM FUNCTIONS
  438.  
  439. // Calculates the length of the LCS of the sequences; fills out the lengths table in the process
  440. // Uses the LCS algorithm from "Introduction to Algorithms", 3rd edition, by Cormen, Leiserson,
  441. // Rivest, and Stein.
  442. // This function is by far the most time-consuming, so optimize it!
  443. static size_t lcs_length(DIFFER* differ)
  444. {
  445.     size_t mpl;
  446.     size_t mpsl;
  447.  
  448.     size_t m;
  449.     size_t n;
  450.  
  451.     size_t mBound;
  452.     size_t nBound;
  453.     size_t last;
  454.  
  455.     size_t i;
  456.     size_t j;
  457.     if (differ->lengths == NULL)
  458.     {
  459.         PyObject** seqA = differ->seqA;
  460.         m = differ->seqALen;
  461.         PyObject** seqB = differ->seqB;
  462.         n = differ->seqBLen;
  463.         int o;
  464.  
  465.         PyObject* eqTest = differ->eqTest;
  466.         PyObject* rowSetup = differ->rowSetup;
  467.  
  468.         PyObject* result;
  469.         char eq;
  470.  
  471.         //Check for matching prefix and postfix
  472.         mpl = 0;
  473.         if (rowSetup != Py_None && m > 0)
  474.         {
  475.             Py_DECREF(PyObject_CallFunctionObjArgs(rowSetup, seqA[0], NULL));
  476.         }
  477.         while (mpl < m)
  478.         {
  479.             result = PyObject_CallFunctionObjArgs(eqTest, seqA[mpl], seqB[mpl], NULL);
  480.             eq = PyObject_IsTrue(result);
  481.             Py_DECREF(result);
  482.             if (!eq)
  483.             {
  484.                 break;
  485.             }
  486.             else
  487.             {
  488.                 mpl++;
  489.                 if (mpl < m && rowSetup != Py_None)
  490.                 {
  491.                     Py_DECREF(PyObject_CallFunctionObjArgs(rowSetup, seqA[mpl], NULL));
  492.                 }
  493.             }
  494.         }
  495.         differ->matchingPrefixLen = mpl;
  496.  
  497.         size_t maxA = m - 1;
  498.         size_t maxB = n - 1;
  499.         m -= mpl;
  500.         n -= mpl;
  501.         o = min(m, n);
  502.  
  503.         mpsl = 0;
  504.         if (rowSetup != Py_None && m > 0)
  505.         {
  506.             Py_DECREF(PyObject_CallFunctionObjArgs(rowSetup, seqA[maxA], NULL));
  507.         }
  508.         while (mpsl < o)
  509.         {
  510.             result = PyObject_CallFunctionObjArgs(eqTest, seqA[maxA - mpsl], seqB[maxB - mpsl], NULL);
  511.             if (PyObject_Not(result))
  512.             {
  513.                 Py_DECREF(result);
  514.                 break;
  515.             }
  516.             else
  517.             {
  518.                 Py_DECREF(result);
  519.                 mpsl++;
  520.                 if (mpsl < o && rowSetup != Py_None)
  521.                 {
  522.                     Py_DECREF(PyObject_CallFunctionObjArgs(rowSetup, seqA[maxA - mpsl], NULL));
  523.                 }
  524.             }
  525.         }
  526.         differ->matchingPostfixLen = mpsl;
  527.  
  528.         m -= mpsl;
  529.         n -= mpsl;
  530.  
  531.         //Height of the table
  532.         mBound = m + 1;
  533.         //Width of the table
  534.         nBound = n + 1;
  535.         last = mBound * nBound - 1;
  536.         PyObject* rowElement;
  537.  
  538.         //Main LCS algorithm
  539.         //Simulated 2D array; lengths[i][j] -> lengths[nBound * i + j]
  540.         size_t* lengths = malloc(sizeof(size_t) * mBound * nBound);
  541.         for (i = 0; i < mBound; i++)
  542.         {
  543.             lengths[i*nBound] = 0;
  544.         }
  545.         for (i = 1; i < nBound; i++)
  546.         {
  547.             lengths[i] = 0;
  548.         }
  549.  
  550.         size_t up = 0;
  551.         size_t current = nBound;
  552.  
  553.         if (m > 0 && n > 0)
  554.         {
  555.             seqA = seqA + mpl;
  556.             seqB = seqB + mpl;
  557.             for (i = 1; i < mBound; i++)
  558.             {
  559.                 lengths[current] = 0;
  560.                 up++;
  561.                 current++;
  562.  
  563.                 rowElement = seqA[i-1];
  564.                 if (rowSetup != Py_None)
  565.                 {
  566.                     Py_DECREF(PyObject_CallFunctionObjArgs(rowSetup, rowElement, NULL));
  567.                 }
  568.                 for (j = 1; j < nBound; j++)
  569.                 {
  570.                     result = PyObject_CallFunctionObjArgs(eqTest, rowElement, seqB[j-1], NULL);
  571.                     eq = PyObject_IsTrue(result);
  572.                     Py_DECREF(result);
  573.                     if (eq)
  574.                     {
  575.                         lengths[current] = lengths[up-1] + 1;
  576.                     }
  577.                     else if (lengths[up] >= lengths[current-1])
  578.                     {
  579.                         lengths[current] = lengths[up];
  580.                     }
  581.                     else
  582.                     {
  583.                         lengths[current] = lengths[current-1];
  584.                     }
  585.                     up++;
  586.                     current++;
  587.                 }
  588.             }
  589.         }
  590.         differ->lengths = lengths;
  591.     }
  592.  
  593.     mpl = differ->matchingPrefixLen;
  594.     mpsl = differ->matchingPostfixLen;
  595.     m = differ->seqALen;
  596.     n = differ->seqBLen;
  597.  
  598.     mBound = m - mpl - mpsl + 1;
  599.     nBound = n - mpl - mpsl + 1;
  600.     last = mBound * nBound - 1;
  601.  
  602.     return differ->lengths[last] + mpl + mpsl;
  603. }
  604.  
  605. // Returns a one-dimensional array with the LCS indices of the two sequences interleaved (that is,
  606. // the indices of the LCS elements in the filtered sequences, first a then b)
  607. static ARRAY* lcs_indices(DIFFER* differ)
  608. {
  609.     size_t i;
  610.     size_t j;
  611.     if (differ->lcs == NULL)
  612.     {
  613.         size_t lcsLen = lcs_length(differ);
  614.  
  615.         ARRAY* lcs = malloc(sizeof(ARRAY));
  616.         lcs->len = lcsLen;
  617.         size_t* lcsArr = malloc(sizeof(size_t) * lcsLen * 2);
  618.         lcs->arr = lcsArr;
  619.  
  620.         if (lcsLen > 0)
  621.         {
  622.             size_t m = differ->seqALen;
  623.             size_t n = differ->seqBLen;
  624.  
  625.             // printf("m=%d\tn=%d\n", m, n);
  626.  
  627.             size_t mpl = differ->matchingPrefixLen;
  628.             size_t mpsl = differ->matchingPostfixLen;
  629.  
  630.             PyObject** seqA = differ->seqA;
  631.             PyObject** seqB = differ->seqB;
  632.  
  633.             size_t* lengths = differ->lengths;
  634.             size_t mBound = m - mpl - mpsl + 1;
  635.             size_t nBound = n - mpl - mpsl + 1;
  636.  
  637.             PyObject* eqTest = differ->eqTest;
  638.             PyObject* rowSetup = differ->rowSetup;
  639.  
  640.             for (i = 0; i < mpl; i++)
  641.             {
  642.                 lcsArr[2 * i] = i;
  643.                 lcsArr[2 * i + 1] = i;
  644.             }
  645.             for (i = 0; i < mpsl; i++)
  646.             {
  647.                 lcsArr[2*(lcsLen - i - 1)] = m - i - 1;
  648.                 lcsArr[2*(lcsLen - i - 1) + 1] = n - i - 1;
  649.             }
  650.             // printf("mpl=%d\tmpsl=%d\n", mpl, mpsl);
  651.  
  652.             size_t lcsIndex = 2*(lcsLen - 1 - mpsl);
  653.             // printf("lcsLen=%d\tlcsIndex=%d\n", lcsLen, lcsIndex);
  654.             i = mBound - 1;
  655.             j = nBound - 1;
  656.             PyObject* result;
  657.             char eq;
  658.             PyObject* rowElement = seqA[mpl+i-1];
  659.  
  660.             if (rowSetup != Py_None)
  661.             {
  662.                 Py_DECREF(PyObject_CallFunctionObjArgs(rowSetup, rowElement, NULL));
  663.             }
  664.             while (i >= 0 && j >= 0 && lcsIndex >= 2*mpl)
  665.             {
  666.                 result = PyObject_CallFunctionObjArgs(eqTest, rowElement, seqB[mpl+j-1], NULL);
  667.                 // printf("i=%d\tj=%d\tlcsI=%d\n", i, j, lcsIndex);
  668.                 eq = PyObject_IsTrue(result);
  669.                 Py_DECREF(result);
  670.                 if (eq)
  671.                 {
  672.                     lcsArr[lcsIndex] = mpl + i - 1;
  673.                     lcsArr[lcsIndex+1] = mpl + j - 1;
  674.                     lcsIndex -= 2;
  675.                     i -= 1;
  676.                     rowElement = seqA[mpl+i-1];
  677.                     if (rowSetup != Py_None)
  678.                     {
  679.                         Py_DECREF(PyObject_CallFunctionObjArgs(rowSetup, rowElement, NULL));
  680.                     }
  681.                     j -= 1;
  682.                 }
  683.                 else if (lengths[(i-1)*nBound + j] >= lengths[i*nBound + j - 1])
  684.                 {
  685.                     i -= 1;
  686.                     rowElement = seqA[mpl+i-1];
  687.                     if (rowSetup != Py_None)
  688.                     {
  689.                         Py_DECREF(PyObject_CallFunctionObjArgs(rowSetup, rowElement, NULL));
  690.                     }
  691.                 }
  692.                 else
  693.                 {
  694.                     j -= 1;
  695.                 }
  696.             }
  697.         }
  698.         differ->lcs = lcs;
  699.     }
  700.     return differ->lcs;
  701. }
  702.  
  703. /*  Python wrapper function for LCS length algorithm.
  704.  * Outputs:
  705.  *  The length of the LCS, as an integer.
  706.  */
  707. static PyObject* py_lcs_length(DIFFER* self)
  708. {
  709.     size_t length = lcs_length(self);
  710.     return Py_BuildValue("I", length);
  711. }
  712.  
  713. /*  Python wrapper function for LCS indices algorithm
  714.  * Outputs:
  715.  *  The indices of the elements of the LCS in each (filtered) sequence, as a list of 2-tuples.
  716.  */
  717. static PyObject* py_lcs_indices(DIFFER* self)
  718. {
  719.     ARRAY* lcs = lcs_indices(self);
  720.     size_t lcsLen = lcs->len;
  721.     size_t* lcsArr = lcs->arr;
  722.  
  723.     PyObject* lcsList = PyList_New(lcsLen);
  724.     size_t i;
  725.     for (i = 0; i < lcsLen; i++)
  726.     {
  727.         PyList_SET_ITEM(lcsList, i, Py_BuildValue("(II)", lcsArr[2*i], lcsArr[2*i+1]));
  728.     }
  729.  
  730.     return lcsList;
  731. }
  732.  
  733. /*  Python function returning the elements of the LCS in a list.
  734.  * Outputs:
  735.  *  A list of the actual elements of the LCS.
  736.  */
  737. static PyObject* py_lcs(DIFFER* self)
  738. {
  739.     ARRAY* lcs = lcs_indices(self);
  740.     size_t* lcsArr = lcs->arr;
  741.     size_t lcsLen = lcs->len;
  742.  
  743.     PyObject* result = PyList_New(lcsLen);
  744.     PyObject** seqA = self->seqA;
  745.     size_t i;
  746.     for (i = 0; i < lcsLen; i++)
  747.     {
  748.         Py_INCREF(seqA[lcsArr[2*i]]);
  749.         PyList_SET_ITEM(result, i, seqA[lcsArr[2*i]]);
  750.     }
  751.     return result;
  752. }
  753.  
  754. // Underlying index conversion method.
  755. static size_t _convert_index(DIFFER* differ, size_t i, char a)
  756. {
  757.     if (a)
  758.     {
  759.         if (i < differ->seqALen)
  760.         {
  761.             return differ->aIndices[i];
  762.         }
  763.         else
  764.         {
  765.             return differ->seqALen;
  766.         }
  767.     }
  768.     else
  769.     {
  770.         if (i < differ->seqBLen)
  771.         {
  772.             return differ->bIndices[i];
  773.         }
  774.         else
  775.         {
  776.             return differ->seqBLen;
  777.         }
  778.     }
  779. }
  780.  
  781. // Converts an index on a differ's filtered sequence A to an index in the original sequence A.
  782. static size_t convert_a_index(DIFFER* differ, size_t i)
  783. {
  784.     return _convert_index(differ, i, 1);
  785. }
  786.  
  787. // Converts an index on a differ's filtered sequence B to an index in the original sequence B.
  788. static size_t convert_b_index(DIFFER* differ, size_t i)
  789. {
  790.     return _convert_index(differ, i, 0);
  791. }
  792.  
  793. // Finds the length of an interval in a diffr's ORIGINAL sequence A, from indices on the filtered
  794. static size_t convert_a_len(DIFFER* differ, size_t i1, size_t i2)
  795. {
  796.     size_t convertedI1 = _convert_index(differ, i1, 1);
  797.     if (i1 != i2)
  798.     {
  799.         return differ->aIndices[i2-1] + 1 - convertedI1;
  800.     }
  801.     else
  802.     {
  803.         return 0;
  804.     }
  805. }
  806.  
  807. // Finds the length of an interval in a diffr's ORIGINAL sequence B, from indices on the filtered
  808. static size_t convert_b_len(DIFFER* differ, size_t i1, size_t i2)
  809. {
  810.     size_t convertedI1 = _convert_index(differ, i1, 0);
  811.     if (i1 != i2)
  812.     {
  813.         return differ->bIndices[i2-1] + 1 - convertedI1;
  814.     }
  815.     else
  816.     {
  817.         return 0;
  818.     }
  819. }
  820.  
  821.  
  822. /*  Returns a list of 3-tuples with information on the "blocks" of the sequences that matched.
  823.  * Outputs:
  824.  *  A list of 3-tuples, one for each matching block, of the form (aStart, bStart, len);
  825.  *  the indices and length are in terms of the original, unfiltered sequences.
  826.  */
  827. static PyObject* py_get_matching_blocks(DIFFER* self)
  828. {
  829.     ARRAY* lcs = lcs_indices(self);
  830.     size_t* lcsArr = lcs->arr;
  831.     size_t lcsLen = lcs->len;
  832.  
  833.     PyObject* result = PyList_New(0);
  834.     if (lcsLen == 0)
  835.     {
  836.         return result;
  837.     }
  838.     size_t i;
  839.     size_t j;
  840.     size_t x;
  841.  
  842.     size_t curBlockI = -1;
  843.     size_t curBlockJ = -1;
  844.     size_t curBlockLength = 0;
  845.  
  846.     for (x = 0; x < lcsLen; x++)
  847.     {
  848.         i = lcsArr[2*x];
  849.         j = lcsArr[2*x+1];
  850.  
  851.         if (curBlockI != -1)
  852.         {
  853.             if (i > curBlockI + curBlockLength || j > curBlockJ + curBlockLength)
  854.             {
  855.                 if (PyList_Append(result, Py_BuildValue("III", convert_a_index(self, curBlockI),
  856.                                                                convert_b_index(self, curBlockJ),
  857.                                                                curBlockLength)) != 0)
  858.                     return NULL;
  859.                 curBlockI = i;
  860.                 curBlockJ = j;
  861.                 curBlockLength = 1;
  862.             }
  863.             else
  864.             {
  865.                 curBlockLength++;
  866.             }
  867.         }
  868.         else
  869.         {
  870.             curBlockI = i;
  871.             curBlockJ = j;
  872.             curBlockLength = 1;
  873.         }
  874.     }
  875.     if (lcsArr[lcsLen*2 - 2] == self->seqALen - 1 && lcsArr[lcsLen*2 -1] == self->seqBLen - 1)
  876.     {
  877.         if (PyList_Append(result, Py_BuildValue("III", convert_a_index(self, curBlockI),
  878.                                                        convert_b_index(self, curBlockJ),
  879.                                                        curBlockLength)) != 0)
  880.             return NULL;
  881.     }
  882.  
  883.     return result;
  884. }
  885.  
  886. /*  Returns a list of 4-tuples with information on the "blocks" of the sequences that didn't match.
  887.  * Outputs:
  888.  *  A list of 4-tuples, one for each differing block, of the form (aStart, aLen, bStart, bLen);
  889.  *  the indices and length are in terms of the original, unfiltered sequences.
  890.  */
  891. static PyObject* py_get_differing_blocks(DIFFER* self)
  892. {
  893.     ARRAY* lcs = lcs_indices(self);
  894.     size_t* lcsArr = lcs->arr;
  895.     size_t lcsLen = lcs->len;
  896.  
  897.     PyObject* result = PyList_New(0);
  898.     if (lcsLen == 0)
  899.     {
  900.         return result;
  901.     }
  902.     size_t i;
  903.     size_t j;
  904.     size_t x;
  905.  
  906.     size_t prevI = -1;
  907.     size_t prevJ = -1;
  908.  
  909.     size_t trueI;
  910.     size_t trueLenI;
  911.     size_t trueJ;
  912.     size_t trueLenJ;
  913.  
  914.     if (lcsArr[0] != 0 || lcsArr[1] != 0)
  915.     {
  916.         trueI = convert_a_index(self, 0);
  917.         trueLenI = convert_a_len(self, 0, lcsArr[0]);
  918.         trueJ = convert_b_index(self, 0);
  919.         trueLenJ = convert_b_len(self, 0, lcsArr[1]);
  920.         printf("Appending (beginning)\n");
  921.         if (PyList_Append(result, Py_BuildValue("IIII", trueI, trueLenI, trueJ, trueLenJ)) != 0)
  922.             return NULL;
  923.     }
  924.  
  925.     for (x = 0; x < lcsLen; x++)
  926.     {
  927.         i = lcsArr[2*x];
  928.         j = lcsArr[2*x+1];
  929.  
  930.         if (prevI != -1 && (i > prevI + 1 || j > prevJ + 1))
  931.         {
  932.             trueI = convert_a_index(self, prevI + 1);
  933.             trueLenI = convert_a_len(self, prevI + 1, i);
  934.             trueJ = convert_b_index(self, prevJ + 1);
  935.             trueLenJ = convert_b_len(self, prevJ + 1, j);
  936.             printf("Appending (loop)\n");
  937.             if (PyList_Append(result, Py_BuildValue("IIII", trueI, trueLenI, trueJ, trueLenJ)) != 0)
  938.                 return NULL;
  939.         }
  940.         prevI = i;
  941.         prevJ = j;
  942.     }
  943.     printf("%d %d %d %d\n", self->seqALen, self->seqBLen, lcsArr[lcsLen*2 - 2], lcsArr[lcsLen*2 - 1]);
  944.     if (lcsArr[lcsLen*2 - 2] < self->seqALen - 1 || lcsArr[lcsLen*2 -1] < self->seqBLen - 1)
  945.     {
  946.         trueI = convert_a_index(self, lcsArr[lcsLen*2-2]+1);
  947.         trueLenI = convert_a_len(self, lcsArr[lcsLen*2-2]+1, self->seqALen);
  948.         trueJ = convert_b_index(self, lcsArr[lcsLen*2-1]+1);
  949.         trueLenJ = convert_b_len(self, lcsArr[lcsLen*2-1]+1, self->seqBLen);
  950.         printf("Appending (end)\n");
  951.         if (PyList_Append(result, Py_BuildValue("IIII", trueI, trueLenI, trueJ, trueLenJ)) != 0)
  952.             return NULL;
  953.     }
  954.  
  955.     return result;
  956. }
  957.  
  958.  
  959. //Getter/setter definitions for Differ class
  960. static PyGetSetDef Differ_getsetters[] = {
  961.     {"a", (getter)py_differ_get_a, (setter)py_differ_set_a, "The first sequence", NULL},
  962.     {"b", (getter)py_differ_get_b, (setter)py_differ_set_b, "The second sequence", NULL},
  963.     {"eqTest", (getter)py_differ_get_eqTest, (setter)py_differ_set_eqTest,
  964.         "The equality test (2-argument)", NULL},
  965.     {"junkTest", (getter)py_differ_get_junkTest, (setter)py_differ_set_junkTest,
  966.         "The junk test (1-argument)", NULL},
  967.     {"rowSetup", (getter)py_differ_get_rowSetup, (setter)py_differ_set_rowSetup,
  968.         "The row setup function (1-argument)", NULL},
  969.     {NULL}
  970. };
  971.  
  972. //Other member variable definitions for Differ
  973. static PyMemberDef Differ_members[] = {
  974.     {NULL}
  975. };
  976.  
  977. //Object methods for Differ
  978. static PyMethodDef Differ_methods[] = {
  979.     {"LCSLength", (PyCFunction)py_lcs_length, METH_NOARGS, NULL},
  980.     {"LCSIndices", (PyCFunction)py_lcs_indices, METH_NOARGS, NULL},
  981.     {"LCS", (PyCFunction)py_lcs, METH_NOARGS, NULL},
  982.     {"GetMatchingBlocks", (PyCFunction)py_get_matching_blocks, METH_NOARGS, NULL},
  983.     {"GetDifferingBlocks", (PyCFunction)py_get_differing_blocks, METH_NOARGS, NULL},
  984.     {NULL}
  985. };
  986.  
  987. //Module methods for _diff
  988. static PyMethodDef module_methods[] = {
  989.     {NULL}
  990. };
  991.  
  992. // Differ type definition
  993. static PyTypeObject DIFFER_Type = {
  994.     PyObject_HEAD_INIT(NULL)
  995.     0,                         /*ob_size*/
  996.     "diff.Differ",             /*tp_name*/
  997.     sizeof(DIFFER),            /*tp_basicsize*/
  998.     0,                         /*tp_itemsize*/
  999.     (destructor)free_differ,   /*tp_dealloc*/
  1000.     0,                         /*tp_print*/
  1001.     0,                         /*tp_getattr*/
  1002.     0,                         /*tp_setattr*/
  1003.     0,                         /*tp_compare*/
  1004.     0,                         /*tp_repr*/
  1005.     0,                         /*tp_as_number*/
  1006.     0,                         /*tp_as_sequence*/
  1007.     0,                         /*tp_as_mapping*/
  1008.     0,                         /*tp_hash */
  1009.     0,                         /*tp_call*/
  1010.     0,                         /*tp_str*/
  1011.     0,                         /*tp_getattro*/
  1012.     0,                         /*tp_setattro*/
  1013.     0,                         /*tp_as_buffer*/
  1014.     Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE | Py_TPFLAGS_HAVE_GC, /*tp_flags*/
  1015.     "Differ objects",          /* tp_doc */
  1016.     (traverseproc)Differ_traverse, /* tp_traverse */
  1017.     (inquiry)Differ_clear,     /* tp_clear */
  1018.     0,                         /* tp_richcompare */
  1019.     0,                         /* tp_weaklistoffset */
  1020.     0,                         /* tp_iter */
  1021.     0,                         /* tp_iternext */
  1022.     Differ_methods,            /* tp_methods */
  1023.     Differ_members,            /* tp_members */
  1024.     Differ_getsetters,         /* tp_getset */
  1025.     0,                         /* tp_base */
  1026.     0,                         /* tp_dict */
  1027.     0,                         /* tp_descr_get */
  1028.     0,                         /* tp_descr_set */
  1029.     0,                         /* tp_dictoffset */
  1030.     (initproc)Differ_init,     /* tp_init */
  1031.     0,                         /* tp_alloc */
  1032.     Differ_new,                /* tp_new */
  1033. };
  1034.  
  1035.  
  1036.  
  1037. #ifndef PyMODINIT_FUNC  /* declarations for DLL import/export */
  1038. #define PyMODINIT_FUNC void
  1039. #endif
  1040.  
  1041. PyMODINIT_FUNC initcdiff(void)
  1042. {
  1043.     PyObject* m;
  1044.  
  1045.     if (PyType_Ready(&DIFFER_Type) < 0)
  1046.         return;
  1047.  
  1048.     //Import statements
  1049.     PyImport_ImportModule("operator");    
  1050.  
  1051.     m = Py_InitModule("cdiff", module_methods);
  1052.     if (m == NULL)
  1053.     {
  1054.         return;
  1055.     }
  1056.  
  1057.     Py_INCREF(&DIFFER_Type);
  1058.     PyModule_AddObject(m, "Differ", (PyObject*)&DIFFER_Type);
  1059.  
  1060. }
Advertisement
Add Comment
Please, Sign In to add comment
Advertisement