Commit

Skip image creation if it exists for sentence, pickle failed attempt for spacy objects

aoldoni committed Sep 18, 2016
1 parent 7ec862d commit 0f23de3
Showing 6 changed files with 101 additions and 37 deletions.
8 changes: 4 additions & 4 deletions README.md
@@ -9,10 +9,10 @@ These scripts help utilising existing tools in the task of information extractin
`git clone https:/aoldoni/comp9596.git .`

- Create directories and prepare assets data:
`mkdir data models parsey stanford training`
`mkdir data/input data/output data/downloaded`
`mkdir data/output/html data/output/ngram data/output/openie data/output/rel`
`cp -R templates/assets data/output/html/assets`
`mkdir data models parsey stanford training data/input data/output data/downloaded data/output/html data/output/ngram data/output/openie data/output/rel data/output/cache`
`cd data/output/html/assets`
`ln -s ../../../templates/assets/ assets`
`cd ../../..`

The next steps depend on what you will be trying to run. On macOS, you might want to replace some of these steps by using `brew`. Information on installing brew can be found at http://brew.sh/.

2 changes: 2 additions & 0 deletions corpus_analysis.py
@@ -59,6 +59,8 @@ def argparser():
        help='uses spacy to generate tree graphs')
    ap.add_argument('-format', help='format of the tree node accumulator')
    ap.add_argument('-behaviour', help='groupby|listing|simplified_groupby')
    ap.add_argument('-f', '--force_clean', action='store_true',
        help='ignores any caching and forces reprocessing')
    return ap

def lemma_search(args):
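The new flag simply lands as a boolean on the parsed arguments, which the cache helpers below check before reusing anything on disk. A minimal sketch of that behaviour, using a stripped-down parser rather than the project's full `argparser()`:

```python
import argparse

# Minimal sketch: only the new option, to show how -f/--force_clean surfaces
# as a boolean on the parsed namespace. The real argparser() defines many
# more arguments.
ap = argparse.ArgumentParser()
ap.add_argument('-f', '--force_clean', action='store_true',
                help='ignores any caching and forces reprocessing')

args = ap.parse_args(['-f'])
print(args.force_clean)  # True -> caches are bypassed and everything is reprocessed
```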
68 changes: 68 additions & 0 deletions internallib/cache.py
@@ -0,0 +1,68 @@
import sys
import itertools
import os
import pickle

from nltk import Tree

import spacy
import spacy.en

from internallib.directories import *

def get_cached_sentence_image(args, output_path, current_sentence_id, file_extension):
    updated_at_date = os.path.getmtime(args.directory + raw_input)
    cache_key = args.word.lower() + str(int(updated_at_date))
    cache_file = args.directory + output_cache + cache_key

    img_name = 'sentence-'+str(current_sentence_id)
    img_path = 'images/' + img_name + "." + file_extension

    cache_file_final = output_path + 'images/' + img_name + "." + file_extension

    if args.force_clean:
        return False
    else:
        return os.path.isfile(cache_file_final)

def get_cached_tokens(args):
    sentences = []

    can_pickle = False

    updated_at_date = os.path.getmtime(args.directory + raw_input)
    cache_key = args.word.lower() + str(int(updated_at_date))
    cache_file = args.directory + output_cache + cache_key + ".spacy"

    # if (os.path.isfile(cache_file) and not args.force_clean and can_pickle):
    # with open(cache_file, 'rb') as f:
    # sentences = pickle.load(f)
    # else:
    en_nlp = spacy.load('en')


    for fn in os.listdir(args.directory+raw_input):
        if (fn == ".DS_Store"):
            continue

        name = args.directory + raw_input + fn

        raw_text = ''

        with open(name, 'r') as input:
            raw_text = input.read()

        if (args.word not in raw_text):
            continue

        en_doc = en_nlp(raw_text)

        for sentence in en_doc.sents:
            for token in sentence:
                if (token.orth_.lower() == args.word.lower()):
                    sentences.append( (token, sentence) )

    # with open(cache_file, "wb") as f:
    # pickle.dump(sentences, f, protocol=pickle.HIGHEST_PROTOCOL)

    return sentences
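The commented-out pickle calls above are the "failed attempt" from the commit message: the (token, sentence) pairs are spaCy objects backed by a parsed Doc's internal data, so they do not pickle cleanly. A hedged sketch of one possible workaround, not what this commit ships, is to cache plain strings and rebuild the pairs by re-parsing; this avoids re-reading and filtering the corpus, though not the parsing itself:

```python
import pickle

def save_sentence_cache(cache_path, sentences):
    # sentences: the (token, sentence) pairs built by get_cached_tokens;
    # only the sentence text is stored, because plain strings pickle cleanly.
    plain = [str(sentence) for _, sentence in sentences]
    with open(cache_path, 'wb') as f:
        pickle.dump(plain, f, protocol=pickle.HIGHEST_PROTOCOL)

def load_sentence_cache(cache_path, en_nlp, word):
    with open(cache_path, 'rb') as f:
        plain = pickle.load(f)
    pairs = []
    for text in plain:
        doc = en_nlp(text)  # re-parse the cached sentence text
        for sentence in doc.sents:
            for token in sentence:
                if token.orth_.lower() == word.lower():
                    pairs.append((token, sentence))
    return pairs
```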
27 changes: 6 additions & 21 deletions internallib/dependency_helpers.py
@@ -1,13 +1,16 @@
import sys
import itertools
import os
import pickle


from nltk import Tree

import spacy
import spacy.en

from internallib.directories import *
from internallib.cache import get_cached_tokens


import logging, sys
@@ -163,28 +166,10 @@ def group_sorting(groups):
    return newlist

def get_tokens(args):
    en_nlp = spacy.load('en')

    for fn in os.listdir(args.directory+raw_input):
        if (fn == ".DS_Store"):
            continue

        name = args.directory + raw_input + fn

        raw_text = ''

        with open(name, 'r') as input:
            raw_text = input.read()

        if (args.word not in raw_text):
            continue

        en_doc = en_nlp(raw_text)
    sentences = get_cached_tokens(args)

        for sentence in en_doc.sents:
            for token in sentence:
                if (token.orth_.lower() == args.word.lower()):
                    yield token, sentence
    for token, sentence in sentences:
        yield token, sentence

def highlight_word(sentence, word):
    string_sentence = str(sentence)
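As a small aside, once `get_tokens` only forwards pairs from `get_cached_tokens`, the delegating loop could be collapsed with generator delegation on Python 3.3+; a sketch, not part of the commit:

```python
def get_tokens(args):
    # Equivalent to the for/yield loop above (sketch only)
    yield from get_cached_tokens(args)
```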
1 change: 1 addition & 0 deletions internallib/directories.py
@@ -16,6 +16,7 @@
output_rel = 'output/rel/'
output_ngram = 'output/ngram/'
output_html = 'output/html/'
output_cache = 'output/cache/'

raw_input = 'input/'

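The new `output_cache` constant is combined with `args.directory` and a cache key in `internallib/cache.py`, so `data/output/cache/` must exist (the README change above adds it to the mkdir step). A hedged alternative, not in the commit, would be to create it at runtime:

```python
import os

from internallib.directories import output_cache  # 'output/cache/'

directory = 'data/'                                # illustrative base directory
cache_dir = os.path.join(directory, output_cache)  # -> data/output/cache/
if not os.path.isdir(cache_dir):
    os.makedirs(cache_dir)                         # create the cache directory on demand
```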
32 changes: 20 additions & 12 deletions internallib/graph.py
@@ -20,6 +20,8 @@

from internallib.graph_processing import Process

from internallib.cache import get_cached_sentence_image

class CommandAccumulative(object):
    def __init__(self, args):
        self.args = args
@@ -155,20 +157,26 @@ def graph_gen_generate(self, accumulator_parents, accumulator_children, id = "")
        return 'images/main_image' + id

    def sentence_to_graph(self, sentence):
        e = Digraph(self.args.word, format='png')
        e.attr('node', shape='box')

        current_id = self.current_token_id
        e.node(str(current_id), sentence.root.orth_)

        self.sentence_to_graph_recursive(sentence.root, current_id, e)

        img_name = 'sentence-'+str(self.current_sentence_id)
        img_path = 'images/' + img_name + "." + self.file_extension
        img_dot_path = 'images/' + img_name
        img_path = img_dot_path + "." + self.file_extension
        self.sentence_imgs.append(img_path)

        e.render(self.output_path + 'images/' + img_name)
        found = get_cached_sentence_image(self.args, \
            self.output_path, \
            self.current_sentence_id, \
            self.file_extension)

        if (not found):
            e = Digraph(self.args.word, format=self.file_extension)
            e.attr('node', shape='box')

            current_id = self.current_token_id
            e.node(str(current_id), sentence.root.orth_)
            self.sentence_to_graph_recursive(sentence.root, current_id, e)
            e.render(self.output_path + img_dot_path)

        self.current_sentence_id += 1

        return img_path
@@ -237,7 +245,7 @@ def graph_gen_html(self):
            all_imgs_html += each_img_html

        t = Template(index)
        c = Context({"main_img": "images/main_image.png",
        c = Context({"main_img": "images/main_image." + self.file_extension,
            "all_sentences": mark_safe(all_imgs_html),
            "word": self.args.word})

@@ -302,7 +310,7 @@ def group_accounting_add(self, tree, token, sentence, img_path):
            ]}

    def gen_group_image(self, token, tree, depth):
        e = Digraph(self.args.word, format='png')
        e = Digraph(self.args.word, format=self.file_extension)
        e.attr('node', shape='box')

        current_id = self.current_token_id
@@ -428,7 +436,7 @@ def run(self):
        self.graph_gen_html()

    def gen_group_image(self, token, tree, depth):
        e = Digraph(self.args.word, format='png')
        e = Digraph(self.args.word, format=self.file_extension)
        e.attr('node', shape='box')

        current_id = self.current_token_id
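Taken together, the graph.py changes stop hard-coding 'png' as the Digraph format and only render a sentence graph when its image is not already on disk. A standalone sketch of that pattern, with illustrative paths and labels and assuming the graphviz package:

```python
import os

from graphviz import Digraph

def render_sentence_graph(word, sentence_id, output_path, file_extension, force_clean=False):
    # Mirror of the skip-if-cached logic: reuse an existing image unless a
    # clean run is forced. Paths and the single node below are illustrative.
    img_dot_path = 'images/sentence-' + str(sentence_id)
    img_path = img_dot_path + '.' + file_extension
    if not force_clean and os.path.isfile(output_path + img_path):
        return img_path

    e = Digraph(word, format=file_extension)  # format passed through, not hard-coded
    e.attr('node', shape='box')
    e.node('0', word)
    e.render(output_path + img_dot_path)      # writes <output_path>images/sentence-<id>.<ext>
    return img_path
```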
