# Untitled

#!/usr/bin/python3

# Names exported by a star-import of this module.
__all__ = [
  "summarize",
  "humanize",
  "summarize_for_parsing",
]

import re
import sys
import nltk

#-------------------------------------------------------------------------------
# Sample inputs that test() feeds to summarize(): a few long article excerpts
# followed by short command-like sentences. The commented-out entries are
# samples that are currently disabled.
stmts = [
  # "pet 10 cats and 2 rabbits",
  # "what's the weather like in california",
  # "At eight o'clock on Thursday morning ... Arthur didn't feel very good.",
  # "If you publish work that uses NLTK, please cite the NLTK book as follows:",

  """I love the justifications that people make for why schools exist. They never want to say what the real reason is (to 'civilise' and 'properly instruct' children towards the hegemonic ideal of a 'citizen', which is a largely colonial or imperial construct... regardless of where you're located).""",
  """Origin of Yom Kippur: Not Moses, but a Murder in the Temple?

While tradition says that observance of the Day of Atonement was initiated by Moses, careful reading of the Bible indicates that the holiday was a much later contrivance.

Yom Kippur, according to tradition, was first observed in the Sinai Desert, during the Israelites' journey from Egypt to the Promised Land, which some sources date to about 1440 B.C.E. But a careful reading of the Bible shows that the Day of Atonement must have been established later, much later – quite possibly, around 400 B.C.E.""",
  """Vegan diets can contain more UPFs on average – but this might not be a problem.

For most people living in industrialised societies, it's difficult to avoid ultra-processed foods altogether. My cereal is ultra-processed. So is the whiskey in my cabinet, the hot sauce in my fridge and the crisps in my backpack.

The term "ultra-processed" is poorly understood and inconsistently used, even sometimes by scientists. While in some circles it has become a catch-all term for foods with little nutritional benefit, a wide variety of foods fall under this umbrella.

Ultra-processed foods are popular with consumers for their convenience (frozen pizza), taste (wrapped cookies), and durability (sandwich bread). These elements, plus the relatively low cost of ingredients, make them profitable for manufacturers.

But recently another motivation for ultra-processed foods has emerged: to replace meat or dairy among those attempting to eat a more plant-based diet. With this new category has come anxiety about the health effects of these products, leading to headlines such as "The unhealthiest fake meats you can buy (and why it's better to go to McDonald's)". These concerns were exacerbated by recent research, which found that those who consume 10% more ultra-processed foods derived from plants have a 12% higher risk of death related to diet. However, things are not quite as they seem. Are plant-based diets really so rich in ultra-processed foods, and are they any worse for you? """,
  "disassemble 10 first instructions at main",
  "Count the number of functions in this database",
  "Kindly find me all the functions that call GetProcAddress, please",
  "Find callers of GetProcAddress that don't call LoadLibrary",
  "undo the last command"
]

# Tokens that nice_sentence() glues to the preceding text without a space.
PUNCTUATION = [".", ",", ":", ";", "...", "?", "!", "-"]

# Tag prefixes kept by summarize(): the word classes and parentheses listed
# here, followed by every PUNCTUATION entry.
INTERESTING = [
  "JJ", "CD", "NN", "VB", "RB", "IN", "PRP", "WRB", "CC", "(", ")",
  *PUNCTUATION,
]

# Tag prefixes kept by summarize_for_parsing().
INTERESTING_PARSE = ["JJ", "CD", "NN", "RB", "VB", "IN", "WRB"]

# Words (compared lower-cased) that summarize_for_parsing() drops.
IGNORE_POLITENESS = ["kindly", "please"]

#-------------------------------------------------------------------------------
def clean_spaces(text):
  """Collapse every run of two or more whitespace characters into one space.

  A lone whitespace character (a single space, newline or tab) is left as it
  is, because the pattern only matches runs of length two or more.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string.
  """
  # Raw string: "\s" inside a plain literal is an invalid escape sequence.
  return re.sub(r"\s{2,}", " ", text)

#-------------------------------------------------------------------------------
def humanize(summary):
  """Join the first element of every summary item into one line of text.

  The joined text goes through str.capitalize(), so its first character is
  upper-cased and all remaining characters are lower-cased.
  """
  words = (entry[0] for entry in summary)
  return " ".join(words).capitalize()

#-------------------------------------------------------------------------------
def nice_sentence(summary):
  """Render a summary as readable text, one sentence per line.

  Only the first element of each summary item is used. Rules applied while
  gluing the tokens together:
    * the first token, and any token following one that ends with ".", is
      passed through str.capitalize();
    * no space is inserted before the first token, after "(", before ")"
      (trailing spaces are stripped there) or before a PUNCTUATION token;
    * every other token is preceded by a single space.
  Finally each ". " becomes ".\n" and surrounding spaces are stripped.
  """
  text = ""
  previous = None  # last token emitted, after capitalisation
  for entry in summary:
    word = entry[0]

    # Sentence start: very first token, or right after a token ending in "."
    if previous is None or previous.endswith("."):
      word = word.capitalize()

    # Spacing only matters once something was emitted, and never after "("
    if previous is not None and previous != "(":
      if word == ")":
        text = text.rstrip(" ")
      elif word not in PUNCTUATION:
        text += " "

    text += word
    previous = word

  return text.replace(". ", ".\n").strip(" ")

#-------------------------------------------------------------------------------
def summarize_for_parsing(summary):
  """Reduce a summary to the bare words used for command parsing.

  Items whose lower-cased word appears in IGNORE_POLITENESS are skipped.
  Each remaining word is emitted once for every INTERESTING_PARSE prefix
  that its tag starts with.
  """
  kept = []
  for entry in summary:
    word = entry[0]
    if word.lower() not in IGNORE_POLITENESS:
      tag = entry[1]
      kept.extend(word for prefix in INTERESTING_PARSE if tag.startswith(prefix))
  return kept

#-------------------------------------------------------------------------------
def summarize(sentence):
  """Tokenize, tag and chunk *sentence*, then keep only the interesting parts.

  Args:
    sentence: the text to analyse.

  Returns:
    A 2-tuple ``(entities, summary)``:
      * entities: the children of the tree returned by nltk.chunk.ne_chunk,
        as a list;
      * summary: the ``(text, tag)`` tuples whose tag starts with one of the
        INTERESTING prefixes. A child that is not a plain ``(word, tag)``
        tuple (a chunk subtree) is flattened into one tuple holding its
        space-joined words and the tag of its first token. Entries whose
        text starts with an apostrophe are left out.
  """
  tokens = nltk.word_tokenize(sentence)
  tagged = nltk.pos_tag(tokens)
  entities = nltk.chunk.ne_chunk(tagged)

  # str.startswith accepts a tuple, so each entity is tested once and can be
  # appended at most once even if its tag matches several prefixes.
  prefixes = tuple(INTERESTING)

  summary = []
  for ent in entities:
    if not isinstance(ent[0], str):
      # Chunk subtree: collapse its (word, tag) leaves into a single entry.
      words = " ".join(leaf[0] for leaf in ent)
      ent = (words, ent[0][1])

    text, tag = ent[0], ent[1]
    if text.startswith("'"):
      continue
    if tag.startswith(prefixes):
      summary.append(ent)

  return list(entities), summary

#-------------------------------------------------------------------------------
def test():
  """Run summarize() on every sample in stmts and print each derived view."""
  # Views that are all computed from the summary alone, in print order.
  views = (
    (">For machines:", summarize_for_parsing),
    (">Humanized :", humanize),
    (">Niced     :", nice_sentence),
  )
  for sample in stmts:
    ents, kept = summarize(sample)
    print(">Sentence    :", repr(sample))
    print(">Entities    :", list(ents))
    print(">Summary     :", list(kept))
    for label, render in views:
      print(label, render(kept))
    print()

#-------------------------------------------------------------------------------
def main(filename):
  """Summarize the text stored in *filename* and print the condensed version.

  The file content is cleaned with clean_spaces(), summarized with
  summarize() and rendered with nice_sentence(). The report shows the length
  of the rendered text, the length of the cleaned input, and the first as a
  percentage of the second.

  Args:
    filename: path of the text file to read.
  """
  with open(filename, "r") as f:
    stmt = clean_spaces(f.read())

  original_size = len(stmt)
  _, summary = summarize(stmt)
  nice = nice_sentence(summary)

  size = len(nice)
  # Guard against an empty input file, which would divide by zero.
  new_size = (size * 100) / original_size if original_size else 0.0
  print(f"Size is {size}, original {original_size}, reduced to {new_size}%:\n\n{nice}\n")

if __name__ == "__main__":
  if len(sys.argv) == 1:
    # No input file given: show how to pass one, then fall back to running
    # the built-in samples.
    print(f"Usage: {sys.argv[0]} <filename>")
    test()
  else:
    main(sys.argv[1])