Commit

Skip image creation if it exists for sentence, pickle failed attempt for spacy objects

aoldoni committed Sep 18, 2016
1 parent 7ec862d commit 0f23de3
Showing 6 changed files with 101 additions and 37 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -9,10 +9,10 @@ These scripts help utilising existing tools in the task of information extractin
`git clone https:/aoldoni/comp9596.git .`

- Create directories and prepare assets data:
`mkdir data models parsey stanford training`
`mkdir data/input data/output data/downloaded`
`mkdir data/output/html data/output/ngram data/output/openie data/output/rel`
`cp -R templates/assets data/output/html/assets`
`mkdir data models parsey stanford training data/input data/output data/downloaded data/output/html data/output/ngram data/output/openie data/output/rel data/output/cache`
`cd data/output/html/assets`
`ln -s ../../../templates/assets/ assets`
`cd ../../..`

The next steps depend on what you will be trying to run. On macOS, you might want to replace some of these steps by using `brew`. Information on installing brew can be found at http://brew.sh/.

2 changes: 2 additions & 0 deletions corpus_analysis.py
@@ -59,6 +59,8 @@ def argparser():
        help='uses spacy to generate tree graphs')
    ap.add_argument('-format', help='format of the tree node accumulator')
    ap.add_argument('-behaviour', help='groupby|listing|simplified_groupby')
    ap.add_argument('-f', '--force_clean', action='store_true',
        help='ignores any caching and forces reprocessing')
    return ap

def lemma_search(args):
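The new flag simply lands as a boolean on the parsed arguments, which the cache helpers below check before reusing anything on disk. A minimal sketch of that behaviour, using a stripped-down parser rather than the project's full `argparser()`:

```python
import argparse

# Minimal sketch: only the new option, to show how -f/--force_clean surfaces
# as a boolean on the parsed namespace. The real argparser() defines many
# more arguments.
ap = argparse.ArgumentParser()
ap.add_argument('-f', '--force_clean', action='store_true',
                help='ignores any caching and forces reprocessing')

args = ap.parse_args(['-f'])
print(args.force_clean)  # True -> caches are bypassed and everything is reprocessed
```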
68 changes: 68 additions & 0 deletions internallib/cache.py
@@ -0,0 +1,68 @@
import sys
import itertools
import os
import pickle

from nltk import Tree

import spacy
import spacy.en

from internallib.directories import *

def get_cached_sentence_image(args, output_path, current_sentence_id, file_extension):
    updated_at_date = os.path.getmtime(args.directory + raw_input)
    cache_key = args.word.lower() + str(int(updated_at_date))
    cache_file = args.directory + output_cache + cache_key

    img_name = 'sentence-'+str(current_sentence_id)
    img_path = 'images/' + img_name + "." + file_extension

    cache_file_final = output_path + 'images/' + img_name + "." + file_extension

    if args.force_clean:
        return False
    else:
        return os.path.isfile(cache_file_final)

def get_cached_tokens(args):
    sentences = []

    can_pickle = False

    updated_at_date = os.path.getmtime(args.directory + raw_input)
    cache_key = args.word.lower() + str(int(updated_at_date))
    cache_file = args.directory + output_cache + cache_key + ".spacy"

    # if (os.path.isfile(cache_file) and not args.force_clean and can_pickle):
    # with open(cache_file, 'rb') as f:
    # sentences = pickle.load(f)
    # else:
    en_nlp = spacy.load('en')


    for fn in os.listdir(args.directory+raw_input):
        if (fn == ".DS_Store"):
            continue

        name = args.directory + raw_input + fn

        raw_text = ''

        with open(name, 'r') as input:
            raw_text = input.read()

        if (args.word not in raw_text):
            continue

        en_doc = en_nlp(raw_text)

        for sentence in en_doc.sents:
            for token in sentence:
                if (token.orth_.lower() == args.word.lower()):
                    sentences.append( (token, sentence) )

    # with open(cache_file, "wb") as f:
    # pickle.dump(sentences, f, protocol=pickle.HIGHEST_PROTOCOL)

    return sentences
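The commented-out pickle calls above are the "failed attempt" from the commit message: the (token, sentence) pairs are spaCy objects backed by a parsed Doc's internal data, so they do not pickle cleanly. A hedged sketch of one possible workaround, not what this commit ships, is to cache plain strings and rebuild the pairs by re-parsing; this avoids re-reading and filtering the corpus, though not the parsing itself:

```python
import pickle

def save_sentence_cache(cache_path, sentences):
    # sentences: the (token, sentence) pairs built by get_cached_tokens;
    # only the sentence text is stored, because plain strings pickle cleanly.
    plain = [str(sentence) for _, sentence in sentences]
    with open(cache_path, 'wb') as f:
        pickle.dump(plain, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_sentence_cache(cache_path, en_nlp, word):
    with open(cache_path, 'rb') as f:
        plain = pickle.load(f)
    pairs = []
    for text in plain:
        doc = en_nlp(text)  # re-parse the cached sentence text
        for sentence in doc.sents:
            for token in sentence:
                if token.orth_.lower() == word.lower():
                    pairs.append((token, sentence))
    return pairs
```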
27 changes: 6 additions & 21 deletions internallib/dependency_helpers.py
@@ -1,13 +1,16 @@
import sys
import itertools
import os
import pickle


from nltk import Tree

import spacy
import spacy.en

from internallib.directories import *
from internallib.cache import get_cached_tokens


import logging, sys
@@ -163,28 +166,10 @@ def group_sorting(groups):
    return newlist

def get_tokens(args):
    en_nlp = spacy.load('en')

    for fn in os.listdir(args.directory+raw_input):
        if (fn == ".DS_Store"):
            continue

        name = args.directory + raw_input + fn

        raw_text = ''

        with open(name, 'r') as input:
            raw_text = input.read()

        if (args.word not in raw_text):
            continue

        en_doc = en_nlp(raw_text)
    sentences = get_cached_tokens(args)

        for sentence in en_doc.sents:
            for token in sentence:
                if (token.orth_.lower() == args.word.lower()):
                    yield token, sentence
    for token, sentence in sentences:
        yield token, sentence

def highlight_word(sentence, word):
    string_sentence = str(sentence)
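As a small aside, once `get_tokens` only forwards pairs from `get_cached_tokens`, the delegating loop could be collapsed with generator delegation on Python 3.3+; a sketch, not part of the commit:

```python
def get_tokens(args):
    # Equivalent to the for/yield loop above (sketch only)
    yield from get_cached_tokens(args)
```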
1 change: 1 addition & 0 deletions internallib/directories.py
@@ -16,6 +16,7 @@
output_rel = 'output/rel/'
output_ngram = 'output/ngram/'
output_html = 'output/html/'
output_cache = 'output/cache/'

raw_input = 'input/'

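The new `output_cache` constant is combined with `args.directory` and a cache key in `internallib/cache.py`, so `data/output/cache/` must exist (the README change above adds it to the mkdir step). A hedged alternative, not in the commit, would be to create it at runtime:

```python
import os

from internallib.directories import output_cache  # 'output/cache/'

directory = 'data/'                                # illustrative base directory
cache_dir = os.path.join(directory, output_cache)  # -> data/output/cache/
if not os.path.isdir(cache_dir):
    os.makedirs(cache_dir)                         # create the cache directory on demand
```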
32 changes: 20 additions & 12 deletions internallib/graph.py
@@ -20,6 +20,8 @@

from internallib.graph_processing import Process

from internallib.cache import get_cached_sentence_image

class CommandAccumulative(object):
    def __init__(self, args):
        self.args = args
@@ -155,20 +157,26 @@ def graph_gen_generate(self, accumulator_parents, accumulator_children, id = "")
        return 'images/main_image' + id

    def sentence_to_graph(self, sentence):
        e = Digraph(self.args.word, format='png')
        e.attr('node', shape='box')

        current_id = self.current_token_id
        e.node(str(current_id), sentence.root.orth_)

        self.sentence_to_graph_recursive(sentence.root, current_id, e)

        img_name = 'sentence-'+str(self.current_sentence_id)
        img_path = 'images/' + img_name + "." + self.file_extension
        img_dot_path = 'images/' + img_name
        img_path = img_dot_path + "." + self.file_extension
        self.sentence_imgs.append(img_path)

        e.render(self.output_path + 'images/' + img_name)
        found = get_cached_sentence_image(self.args, \
            self.output_path, \
            self.current_sentence_id, \
            self.file_extension)

        if (not found):
            e = Digraph(self.args.word, format=self.file_extension)
            e.attr('node', shape='box')

            current_id = self.current_token_id
            e.node(str(current_id), sentence.root.orth_)
            self.sentence_to_graph_recursive(sentence.root, current_id, e)
            e.render(self.output_path + img_dot_path)

        self.current_sentence_id += 1

        return img_path
@@ -237,7 +245,7 @@ def graph_gen_html(self):
            all_imgs_html += each_img_html

        t = Template(index)
        c = Context({"main_img": "images/main_image.png",
        c = Context({"main_img": "images/main_image." + self.file_extension,
            "all_sentences": mark_safe(all_imgs_html),
            "word": self.args.word})

@@ -302,7 +310,7 @@ def group_accounting_add(self, tree, token, sentence, img_path):
            ]}

    def gen_group_image(self, token, tree, depth):
        e = Digraph(self.args.word, format='png')
        e = Digraph(self.args.word, format=self.file_extension)
        e.attr('node', shape='box')

        current_id = self.current_token_id
@@ -428,7 +436,7 @@ def run(self):
        self.graph_gen_html()

    def gen_group_image(self, token, tree, depth):
        e = Digraph(self.args.word, format='png')
        e = Digraph(self.args.word, format=self.file_extension)
        e.attr('node', shape='box')

        current_id = self.current_token_id
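Taken together, the graph.py changes stop hard-coding 'png' as the Digraph format and only render a sentence graph when its image is not already on disk. A standalone sketch of that pattern, with illustrative paths and labels and assuming the graphviz package:

```python
import os

from graphviz import Digraph

def render_sentence_graph(word, sentence_id, output_path, file_extension, force_clean=False):
    # Mirror of the skip-if-cached logic: reuse an existing image unless a
    # clean run is forced. Paths and the single node below are illustrative.
    img_dot_path = 'images/sentence-' + str(sentence_id)
    img_path = img_dot_path + '.' + file_extension
    if not force_clean and os.path.isfile(output_path + img_path):
        return img_path

    e = Digraph(word, format=file_extension)  # format passed through, not hard-coded
    e.attr('node', shape='box')
    e.node('0', word)
    e.render(output_path + img_dot_path)      # writes <output_path>images/sentence-<id>.<ext>
    return img_path
```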
