From a952a62233d454934971596f692dadf40b0c7a22 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Fri, 22 Jul 2022 12:30:46 -0700 Subject: [PATCH 01/20] Storing entry types in type attributes --- forte/common/constants.py | 8 + forte/data/base_pack.py | 56 +-- forte/data/data_store.py | 190 ++++++++-- .../data/ontology/ontology_code_generator.py | 14 +- .../data/data_store_serialization_test.py | 349 +++++++++++------- tests/forte/data/data_store_test.py | 61 +-- 6 files changed, 464 insertions(+), 214 deletions(-) diff --git a/forte/common/constants.py b/forte/common/constants.py index db28078ff..ac90f7d35 100644 --- a/forte/common/constants.py +++ b/forte/common/constants.py @@ -38,6 +38,14 @@ # ``_type_attributes`` of ``DataStore``. TYPE_ATTR_KEY = "attributes" +# Name of the key to access the type of an attribute from +# ``_type_attributes`` of ``DataStore``. +ATTR_TYPE_KEY = "type" + +# Name of the key to access the index of an attribute from +# ``_type_attributes`` of ``DataStore``. +ATTR_INDEX_KEY = "index" + # Name of the key to access a set of parent names of an entry type from # ``_type_attributes`` of ``DataStore``. PARENT_CLASS_KEY = "parent_class" diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index a519ab7fd..b3513139d 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -20,6 +20,7 @@ from abc import abstractmethod from pathlib import Path from typing import ( + ForwardRef, List, Optional, Set, @@ -32,7 +33,6 @@ Iterable, ) from functools import partial -from typing_inspect import get_origin from packaging.version import Version import jsonpickle @@ -455,7 +455,7 @@ def on_entry_creation( # Use the auto-inferred control component. c = self.__control_component - def entry_getter(cls: Entry, attr_name: str, field_type): + def entry_getter(cls: Entry, attr_name: str): """A getter function for dataclass fields of entry object. When the field contains ``tid``s, we will convert them to entry object on the fly. @@ -463,24 +463,37 @@ def entry_getter(cls: Entry, attr_name: str, field_type): Args: cls: An ``Entry`` class object. attr_name: The name of the attribute. - field_type: The type of the attribute. """ + data_store_ref = ( cls.pack._data_store # pylint: disable=protected-access ) attr_val = data_store_ref.get_attribute( tid=cls.tid, attr_name=attr_name ) - if field_type in (FList, FDict): + entry_type = data_store_ref.get_entry_types( + cls.entry_type(), attr_name + ) + + if entry_type[0] in (FList, FDict): # Generate FList/FDict object on the fly - return field_type(parent_entry=cls, data=attr_val) + return entry_type[0](parent_entry=cls, data=attr_val) try: - # TODO: Find a better solution to determine if a field is Entry - # Will be addressed by https://github.com/asyml/forte/issues/835 - # Convert tid to entry object on the fly - if isinstance(attr_val, int): - # Single pack entry + # Check dataclass attribute value type + # If the attribute was an Entry object, only its tid + # is stored in the DataStore and hence its needs to be converted. + if entry_type[1] and ( + any( + isinstance(entry, ForwardRef) + for entry in list(entry_type[1]) + ) + or any( + issubclass(entry, (Entry)) + for entry in list(entry_type[1]) + ) + ): return cls.pack.get_entry(tid=attr_val) + # The condition below is to check whether the attribute's value # is a pair of integers - `(pack_id, tid)`. 
If so we may have # encountered a `tid` that can only be resolved by @@ -497,7 +510,7 @@ def entry_getter(cls: Entry, attr_name: str, field_type): pass return attr_val - def entry_setter(cls: Entry, value: Any, attr_name: str, field_type): + def entry_setter(cls: Entry, value: Any, attr_name: str): """A setter function for dataclass fields of entry object. When the value contains entry objects, we will convert them into ``tid``s before storing to ``DataStore``. @@ -506,16 +519,20 @@ def entry_setter(cls: Entry, value: Any, attr_name: str, field_type): cls: An ``Entry`` class object. value: The value to be assigned to the attribute. attr_name: The name of the attribute. - field_type: The type of the attribute. """ attr_value: Any data_store_ref = ( cls.pack._data_store # pylint: disable=protected-access ) + + entry_type = data_store_ref.get_entry_types( + cls.entry_type(), attr_name + ) + # Assumption: Users will not assign value to a FList/FDict field. # Only internal methods can set the FList/FDict field, and value's # type has to be Iterator[Entry]/Dict[Any, Entry]. - if field_type is FList: + if entry_type[0] is FList: try: attr_value = [entry.tid for entry in value] except AttributeError as e: @@ -523,7 +540,7 @@ def entry_setter(cls: Entry, value: Any, attr_name: str, field_type): "You are trying to assign value to a `FList` field, " "which can only accept an iterator of `Entry` objects." ) from e - elif field_type is FDict: + elif entry_type[0] is FDict: try: attr_value = { key: entry.tid for key, entry in value.items() @@ -554,10 +571,9 @@ def entry_setter(cls: Entry, value: Any, attr_name: str, field_type): self._save_entry_to_data_store(entry=entry) # Register property functions for all dataclass fields. - for name, field in entry.__dataclass_fields__.items(): + for name, _ in entry.__dataclass_fields__.items(): # Convert the typing annotation to the original class. # This will be used to determine if a field is FList/FDict. - field_type = get_origin(field.type) setattr( type(entry), name, @@ -566,12 +582,8 @@ def entry_setter(cls: Entry, value: Any, attr_name: str, field_type): property( # We need to bound the attribute name and field type here # for the getter and setter of each field. - fget=partial( - entry_getter, attr_name=name, field_type=field_type - ), - fset=partial( - entry_setter, attr_name=name, field_type=field_type - ), + fget=partial(entry_getter, attr_name=name), + fset=partial(entry_setter, attr_name=name), ), ) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index ec20a7407..e38ed01f0 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -12,13 +12,13 @@ # limitations under the License. import json -from typing import Dict, List, Iterator, Tuple, Optional, Any, Type +from typing import Dict, List, Iterator, Set, Tuple, Optional, Any, Type import uuid import logging from heapq import heappush, heappop from sortedcontainers import SortedList -from typing_inspect import get_origin +from typing_inspect import get_origin, get_args from forte.utils import get_class from forte.utils.utils import get_full_module_name @@ -161,9 +161,11 @@ def __init__( ``type_name``, their parent entry, and the order of corresponding attributes. The keys are fully qualified names of every type; The value is a dictionary with two keys. Key ``attribute`` provides an inner dictionary - with all valid attributes for this type and the indices of attributes - among these lists. Key ``parent_class`` is a string representing the - ancestors of this type. 
+ with all valid attributes for this type and the information of attributes + among these lists. This is information is represented as a dictionary. The + information represented in this dictionary is the index of the attribute + and the type of the variable it stores. Key ``parent_class`` is a string + representing the ancestors of this type. This structure is supposed to be built dynamically. When a user adds new entries, `DataStore` will check unknown types and add them to @@ -175,20 +177,30 @@ def __init__( # DataStore._type_attributes is: # { - # "ft.onto.base_ontology.Token": { - # "attributes": {"pos": 4, "ud_xpos": 5, - # "lemma": 6, "chunk": 7, "ner": 8, "sense": 9, - # "is_root": 10, "ud_features": 11, "ud_misc": 12}, - # "parent_class": set("forte.data.ontology.top.Annotation"), }, - # "ft.onto.base_ontology.Document": { - # "attributes": {"document_class": 4, - # "sentiment": 5, "classifications": 6}, - # "parent_class": set("forte.data.ontology.top.Annotation"), }, - # "ft.onto.base_ontology.Sentence": { - # "attributes": {"speaker": 4, - # "part_id": 5, "sentiment": 6, - # "classification": 7, "classifications": 8}, - # "parent_class": set(), } + # "ft.onto.base_ontology.Document": { + # "attributes": { + # "document_class": {"index": 4, "type": (list, (str,))}, + # "sentiment": {"index": 5, "type": (dict, (str, float))}, + # "classifications": { + # "index": 6, + # "type":(FDict,(str, Classification)) + # } + # }, + # "parent_class": set(), + # }, + # "ft.onto.base_ontology.Sentence": { + # "attributes": { + # "speaker": {"index": 4, "type": (Union, (str, type(None)))}, + # "part_id": {"index": 5, "type": (Union, (int, type(None)))}, + # "sentiment": {"index": 6, "type": (dict, (str, float))}, + # "classification": {"index": 7, "type": (dict, (str, float))}, + # "classifications": { + # "index": 8, + # "type": (FDict,(str, Classification)) + # }, + # }, + # "parent_class": set(), + # }, # } """ self._init_top_to_core_entries() @@ -262,11 +274,16 @@ def __getstate__(self): for k in self.__elements: # build the full `_type_attributes` self._get_type_info(k) + for _, info in self._type_attributes[k][ + constants.TYPE_ATTR_KEY + ].items(): + info.pop(constants.ATTR_TYPE_KEY) state["_DataStore__elements"][k] = list(self.__elements[k]) state.pop("_DataStore__tid_ref_dict") state.pop("_DataStore__tid_idx_dict") state.pop("_DataStore__deletion_count") state["entries"] = state.pop("_DataStore__elements") + state["fields"] = self._type_attributes for _, v in state["fields"].items(): if constants.PARENT_CLASS_KEY in v: @@ -287,6 +304,20 @@ def __setstate__(self, state): self._DataStore__tid_idx_dict = {} self._DataStore__deletion_count = {} + # Update `_type_attributes` to store the types of each + # entry attribute as well. + for tn in self._type_attributes: + entry_type = self._add_entry_types(tn) + for attr, type_val in entry_type.items(): + try: + info_dict = self._type_attributes[tn][ + constants.TYPE_ATTR_KEY + ][attr] + except KeyError: + continue + if constants.ATTR_TYPE_KEY not in info_dict: + info_dict[constants.ATTR_TYPE_KEY] = type_val + reset_index = {} for k in self.__elements: if self._is_annotation(k): @@ -395,9 +426,19 @@ def check_fields(store): # If a field only occurs in the serialized object but not in # the current class, it will not be detected. # Instead, it will be dropped later. 
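# Illustrative sketch (not from the patch; attribute names and indices below
# are made up): with the nested per-attribute records, the comparison in
# ``check_fields`` works on (attribute, index) pairs extracted from
# ``{"attributes": {name: {"index": ..., "type": ...}}}`` records.
def _index_pairs(record):
    # Reduce an entry-type record to a set of (attribute, index) pairs.
    return {
        (attr, info["index"]) for attr, info in record["attributes"].items()
    }

_saved = {"attributes": {"speaker": {"index": 4}, "part_id": {"index": 5}}}
_current = {"attributes": {"part_id": {"index": 4}, "speaker": {"index": 5}}}
# Fields of the current class whose slot differs from the serialized record:
_moved = _index_pairs(_current) - _index_pairs(_saved)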
- diff = set(v[constants.TYPE_ATTR_KEY].items()) - set( - store._type_attributes[t][constants.TYPE_ATTR_KEY].items() + + # This lambda function is used to get a temporary + # representation of type_attributes with only the + # name and index + get_temp_rep = lambda entry: set( + (attr, val[constants.ATTR_INDEX_KEY]) + for attr, val in entry[constants.TYPE_ATTR_KEY].items() ) + + temp_cls_rep = get_temp_rep(v) + temp_obj_rep = get_temp_rep(store._type_attributes[t]) + + diff = temp_cls_rep - temp_obj_rep for f in diff: # if fields appear in both the current class and the # serialized objects but have different orders, switch @@ -410,7 +451,7 @@ def check_fields(store): # objects. Save different indices to a dictionary. change_map[f[1]] = store._type_attributes[t][ constants.TYPE_ATTR_KEY - ][f[0]] + ][f[0]][constants.ATTR_INDEX_KEY] # record indices of fields that only appear in the # current class. We want to fill them with None. else: @@ -433,7 +474,13 @@ def check_fields(store): # throw fields that are redundant/only appear in # the serialized object for i in range( - max(v[constants.TYPE_ATTR_KEY].values()) + 1 + max( + info[constants.ATTR_INDEX_KEY] + for info in v[ + constants.TYPE_ATTR_KEY + ].values() + ) + + 1 ) ] if len(contradict_loc) > 0: @@ -538,8 +585,13 @@ def _get_type_info(self, type_name: str) -> Dict[str, Any]: attr_dict = {} attr_idx = constants.ENTRY_TYPE_INDEX + 1 + type_dict = self._add_entry_types(type_name) + for attr_name in attributes: - attr_dict[attr_name] = attr_idx + attr_dict[attr_name] = { + constants.ATTR_TYPE_KEY: type_dict[attr_name], + constants.ATTR_INDEX_KEY: attr_idx, + } attr_idx += 1 new_entry_info = { @@ -547,9 +599,10 @@ def _get_type_info(self, type_name: str) -> Dict[str, Any]: constants.PARENT_CLASS_KEY: set(), } DataStore._type_attributes[type_name] = new_entry_info + return new_entry_info - def _get_type_attribute_dict(self, type_name: str) -> Dict[str, int]: + def _get_type_attribute_dict(self, type_name: str) -> Dict[str, Dict]: """Get the attribute dict of an entry type. The attribute dict maps attribute names to a list of consecutive integers as indices. For example: .. code-block:: python @@ -593,18 +646,54 @@ def _default_attributes_for_type(self, type_name: str) -> List: attr_dict (list): A list of attributes with default values. """ attr_dict: Dict = self._get_type_attribute_dict(type_name) - attr_fields: Dict = self._get_entry_attributes_by_class(type_name) attr_list: List = [None] * len(attr_dict) - for attr_name, attr_id in attr_dict.items(): + for attr_name, attr_info in attr_dict.items(): # TODO: We should keep a record of the attribute class instead of # inspecting the class on the fly. - attr_class = get_origin(attr_fields[attr_name].type) + attr_id = attr_info[constants.ATTR_INDEX_KEY] + + attr_class = attr_dict[attr_name][constants.ATTR_TYPE_KEY][0] if attr_class in (FList, list, List): attr_list[attr_id - constants.ATTR_BEGIN_INDEX] = [] elif attr_class in (FDict, dict, Dict): attr_list[attr_id - constants.ATTR_BEGIN_INDEX] = {} return attr_list + def _add_entry_types( + self, type_name: str, attributes: Optional[Set[Tuple[str, str]]] = None + ) -> Dict: + r"""This function takes a fully qualified ``type_name`` class name, + adds the type of all its dataclass attributes to the + `_entry_type_dict` dictionary class variable. + + Args: + type_name: A fully qualified name of an entry class. + attributes: This argument is used when parsing ontology + files. The entries in the set are a tuples of two + elements. + .. 
code-block:: python + + attributes = { + ('passage_id', 'str'), + ('author', 'str') + } + """ + type_dict = {} + + if attributes: + for attr, type_val in attributes: + type_dict[attr] = (type(None), (get_class(type_val),)) + + else: + attr_fields: Dict = self._get_entry_attributes_by_class(type_name) + for attr_name, attr_info in attr_fields.items(): + attr_class = get_origin(attr_info.type) + attr_args = get_args(attr_info.type) + + type_dict[attr_name] = (attr_class, attr_args) + + return type_dict + def _is_subclass( self, type_name: str, cls, no_dynamic_subclass: bool = False ) -> bool: @@ -691,6 +780,33 @@ def _is_annotation(self, type_name: str) -> bool: for entry_class in (Annotation, AudioAnnotation) ) + def get_entry_types( + self, type_name: str, attr_name: str + ) -> Tuple[Any, Tuple]: + """ + Retrieve the entry type of a given attribute ``attr_name`` + in an entry of type ``type_name`` + + Args: + type_name (str): The type name of the entry whose attribute enty + type needs to be fetched + attr_name (str): The name of the attribute in the entry whose type + information needs to be fetched. + + Returns: + The type information of the required attribute. This infromation is + stored in the ``_type_attributes`` dictionary of the Data Store. + """ + try: + return self._type_attributes[type_name][constants.TYPE_ATTR_KEY][ + attr_name + ][constants.ATTR_TYPE_KEY] + except KeyError as e: + raise KeyError( + f"Attribute {attr_name} does not have type " + "information provided" + ) from e + def all_entries(self, entry_type_name: str) -> Iterator[List]: """ Retrieve all entry data of entry type ``entry_type_name`` and @@ -948,7 +1064,9 @@ def set_attribute(self, tid: int, attr_name: str, attr_value: Any): entry, entry_type = self.get_entry(tid) try: - attr_id = self._get_type_attribute_dict(entry_type)[attr_name] + attr_id = self._get_type_attribute_dict(entry_type)[attr_name][ + constants.ATTR_INDEX_KEY + ] except KeyError as e: raise KeyError(f"{entry_type} has no {attr_name} attribute.") from e @@ -986,7 +1104,9 @@ def get_attribute(self, tid: int, attr_name: str) -> Any: entry, entry_type = self.get_entry(tid) try: - attr_id = self._get_type_attribute_dict(entry_type)[attr_name] + attr_id = self._get_type_attribute_dict(entry_type)[attr_name][ + constants.ATTR_INDEX_KEY + ] except KeyError as e: raise KeyError(f"{entry_type} has no {attr_name} attribute.") from e @@ -1641,17 +1761,21 @@ def _parse_onto_file(self): attr_dict = {} idx = constants.ATTR_BEGIN_INDEX + type_dict = self._add_entry_types(entry_name, entry_node.attributes) + # sort the attribute dictionary for d in sorted(entry_node.attributes): - name = d - attr_dict[name] = idx + name = d[0] + attr_dict[name] = { + constants.ATTR_INDEX_KEY: idx, + constants.ATTR_TYPE_KEY: type_dict[name], + } idx += 1 entry_dict = {} entry_dict[constants.PARENT_CLASS_KEY] = set() entry_dict[constants.PARENT_CLASS_KEY].add(entry_node.parent.name) entry_dict[constants.TYPE_ATTR_KEY] = attr_dict - DataStore._type_attributes[entry_name] = entry_dict def _init_top_to_core_entries(self): diff --git a/forte/data/ontology/ontology_code_generator.py b/forte/data/ontology/ontology_code_generator.py index b43125cc9..84b7be48f 100644 --- a/forte/data/ontology/ontology_code_generator.py +++ b/forte/data/ontology/ontology_code_generator.py @@ -811,7 +811,8 @@ def parse_schema( module_writer.add_entry(en, entry_item) # Adding entry attributes to the allowed types for validation. 
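# Illustrative sketch (not from the patch; the entry and attribute names are
# examples): after this change ``allowed_types_tree`` keeps
# (attribute_name, attribute_type) tuples rather than bare attribute names,
# so a check for an existing name projects out the first tuple element.
_allowed = {
    "ft.onto.test.Description": {("author", "str"), ("passage_id", "str")}
}
_existing_names = {name for name, _ in _allowed["ft.onto.test.Description"]}
assert "author" in _existing_names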
- for property_name in properties: + for property in properties: + property_name = property[0] # Check if the name is allowed. if not property_name.isidentifier(): raise InvalidIdentifierException( @@ -826,7 +827,7 @@ def parse_schema( f"the ontology, will be overridden", DuplicatedAttributesWarning, ) - self.allowed_types_tree[en.class_name].add(property_name) + self.allowed_types_tree[en.class_name].add(property) # populate the entry tree based on information if merged_entry_tree is not None: curr_entry_name = en.class_name @@ -1032,16 +1033,17 @@ def parse_entry( property_items, property_names = [], [] for prop_schema in properties: # TODO: add test - prop_name = prop_schema["name"] - if prop_name in RESERVED_ATTRIBUTE_NAMES: + prop = (prop_schema["name"], prop_schema["type"]) + + if prop_schema["name"] in RESERVED_ATTRIBUTE_NAMES: raise InvalidIdentifierException( - f"The attribute name {prop_name} is reserved and cannot be " + f"The attribute name {prop_schema['name']} is reserved and cannot be " f"used, please consider changed the name. The list of " f"reserved name strings are " f"{RESERVED_ATTRIBUTE_NAMES}" ) - property_names.append(prop_schema["name"]) + property_names.append(prop) property_items.append(self.parse_property(entry_name, prop_schema)) # For special classes that requires a constraint. diff --git a/tests/forte/data/data_store_serialization_test.py b/tests/forte/data/data_store_serialization_test.py index f5048f5c2..511792ce0 100644 --- a/tests/forte/data/data_store_serialization_test.py +++ b/tests/forte/data/data_store_serialization_test.py @@ -16,11 +16,14 @@ """ import logging +from typing import Union import unittest import tempfile import os from sortedcontainers import SortedList from forte.data.data_store import DataStore +from forte.data.ontology.core import FDict +from ft.onto.base_ontology import Classification logging.basicConfig(level=logging.DEBUG) @@ -36,19 +39,29 @@ def setUp(self) -> None: DataStore._type_attributes = { "ft.onto.base_ontology.Document": { "attributes": { - "sentiment": 4, - "classifications": 5, + "document_class": {"index": 4, "type": (list, (str,))}, + "sentiment": {"index": 5, "type": (dict, (str, float))}, + "classifications": { + "index": 6, + "type": (FDict, (str, Classification)), + }, }, "parent_entry": "forte.data.ontology.top.Annotation", }, "ft.onto.base_ontology.Sentence": { "attributes": { - "sentiment": 4, - "speaker": 5, - "part_id": 6, - "classification_test": 7, - "classifications": 8, - "temp": 9, + "sentiment": {"index": 4, "type": (dict, (str, float))}, + "speaker": {"index": 5, "type": (Union, (str, type(None)))}, + "part_id": {"index": 6, "type": (Union, (int, type(None)))}, + "classification_test": { + "index": 7, + "type": (dict, (str, float)), + }, + "classifications": { + "index": 8, + "type": (FDict, (str, Classification)), + }, + "temp": {"index": 9, "type": (Union, (str, type(None)))}, }, "parent_entry": "forte.data.ontology.top.Annotation", }, @@ -73,7 +86,9 @@ def setUp(self) -> None: 1234, "ft.onto.base_ontology.Document", "Positive", - None, + ["Doc class A"], + {"Negative": 0}, + {}, ], [ 10, @@ -81,7 +96,9 @@ def setUp(self) -> None: 3456, "ft.onto.base_ontology.Document", "Negative", - "Class B", + ["Doc class B"], + {}, + {}, ], [ 15, @@ -89,7 +106,9 @@ def setUp(self) -> None: 4567, "ft.onto.base_ontology.Document", "Positive", - "Class C", + ["Doc class C"], + {"Negative": 0}, + {}, ], [ 20, @@ -97,7 +116,9 @@ def setUp(self) -> None: 5678, "ft.onto.base_ontology.Document", "Neutral", - "Class D", 
+ ["Doc class D"], + {}, + {}, ], [ 40, @@ -105,7 +126,9 @@ def setUp(self) -> None: 7890, "ft.onto.base_ontology.Document", "Very Positive", - "Class E", + ["Doc class E"], + {"Positive": 0}, + {}, ], ], ), @@ -116,23 +139,23 @@ def setUp(self) -> None: 9, 9999, "ft.onto.base_ontology.Sentence", - "Positive", + {}, "teacher", 1, - None, - None, - "cba", + {"Negative": 0}, + {}, + "abc", ], [ 55, 70, 1234567, "ft.onto.base_ontology.Sentence", - "Negative", - None, - None, - "Class C", - "Class D", + {}, + "student", + 2, + {"Positive": 0}, + {}, "abc", ], [ @@ -140,24 +163,24 @@ def setUp(self) -> None: 90, 100, "ft.onto.base_ontology.Sentence", - "Positive", - "student", + {}, + "teacher", 2, - "testA", - "class1", - "bad", + {"Positive": 0}, + {}, + "cba", ], [ 65, 90, 5000, "ft.onto.base_ontology.Sentence", - "Positive", + {}, "TA", - 3, - "testB", - "class2", - "good", + 1, + {"Positive": 0}, + {}, + "bad", ], ], ), @@ -236,19 +259,34 @@ def test_save_attribute_pickle(self): DataStore._type_attributes = { "ft.onto.base_ontology.Document": { "attributes": { - "document_class": 4, - "sentiment": 5, - "classifications": 6, + "document_class": {"index": 4, "type": (list, (str,))}, + "sentiment": {"index": 5, "type": (dict, (str, float))}, + "classifications": { + "index": 6, + "type": (FDict, (str, Classification)), + }, }, "parent_entry": "forte.data.ontology.top.Annotation", }, "ft.onto.base_ontology.Sentence": { "attributes": { - "speaker": 4, - "part_id": 5, - "sentiment": 6, - "classification": 7, - "classifications": 8, + "speaker": { + "index": 4, + "type": (Union, (str, type(None))), + }, + "part_id": { + "index": 5, + "type": (Union, (int, type(None))), + }, + "sentiment": {"index": 6, "type": (dict, (str, float))}, + "classification": { + "index": 7, + "type": (dict, (str, float)), + }, + "classifications": { + "index": 8, + "type": (FDict, (str, Classification)), + }, }, "parent_entry": "forte.data.ontology.top.Annotation", }, @@ -277,45 +315,50 @@ def test_save_attribute_pickle(self): 5, 1234, "ft.onto.base_ontology.Document", - None, "Positive", - None, + ["Doc class A"], + {"Negative": 0}, + {}, ], [ 10, 25, 3456, "ft.onto.base_ontology.Document", - None, "Negative", - "Class B", + ["Doc class B"], + {}, + {}, ], [ 15, 20, 4567, "ft.onto.base_ontology.Document", - None, "Positive", - "Class C", + ["Doc class C"], + {"Negative": 0}, + {}, ], [ 20, 25, 5678, "ft.onto.base_ontology.Document", - None, "Neutral", - "Class D", + ["Doc class D"], + {}, + {}, ], [ 40, 55, 7890, "ft.onto.base_ontology.Document", - None, "Very Positive", - "Class E", + ["Doc class E"], + {"Positive": 0}, + {}, ], ], ), @@ -328,31 +371,31 @@ def test_save_attribute_pickle(self): "ft.onto.base_ontology.Sentence", "teacher", 1, - "Positive", - None, + {}, None, + {}, ], [ 55, 70, 1234567, "ft.onto.base_ontology.Sentence", + "student", + 2, + {}, None, - None, - "Negative", - None, - "Class D", + {}, ], [ 60, 90, 100, "ft.onto.base_ontology.Sentence", - "student", + "teacher", 2, - "Positive", + {}, None, - "class1", + {}, ], [ 65, @@ -360,10 +403,10 @@ def test_save_attribute_pickle(self): 5000, "ft.onto.base_ontology.Sentence", "TA", - 3, - "Positive", + 1, + {}, None, - "class2", + {}, ], ], ), @@ -423,14 +466,14 @@ def test_save_attribute_pickle(self): ][3], }, ) - + self.assertEqual( temp._DataStore__tid_idx_dict, { 10123: ["forte.data.ontology.top.Group", 0], 23456: ["forte.data.ontology.top.Group", 1], 88888: ["forte.data.ontology.top.Link", 0], - } + }, ) temp = DataStore.deserialize( @@ -448,7 
+491,9 @@ def test_save_attribute_pickle(self): 1234, "ft.onto.base_ontology.Document", "Positive", - None, + ["Doc class A"], + {"Negative": 0}, + {}, ], [ 10, @@ -456,7 +501,9 @@ def test_save_attribute_pickle(self): 3456, "ft.onto.base_ontology.Document", "Negative", - "Class B", + ["Doc class B"], + {}, + {}, ], [ 15, @@ -464,7 +511,9 @@ def test_save_attribute_pickle(self): 4567, "ft.onto.base_ontology.Document", "Positive", - "Class C", + ["Doc class C"], + {"Negative": 0}, + {}, ], [ 20, @@ -472,7 +521,9 @@ def test_save_attribute_pickle(self): 5678, "ft.onto.base_ontology.Document", "Neutral", - "Class D", + ["Doc class D"], + {}, + {}, ], [ 40, @@ -480,7 +531,9 @@ def test_save_attribute_pickle(self): 7890, "ft.onto.base_ontology.Document", "Very Positive", - "Class E", + ["Doc class E"], + {"Positive": 0}, + {}, ], ], ), @@ -491,23 +544,23 @@ def test_save_attribute_pickle(self): 9, 9999, "ft.onto.base_ontology.Sentence", - "Positive", + {}, "teacher", 1, - None, - None, - "cba", + {"Negative": 0}, + {}, + "abc", ], [ 55, 70, 1234567, "ft.onto.base_ontology.Sentence", - "Negative", - None, - None, - "Class C", - "Class D", + {}, + "student", + 2, + {"Positive": 0}, + {}, "abc", ], [ @@ -515,26 +568,26 @@ def test_save_attribute_pickle(self): 90, 100, "ft.onto.base_ontology.Sentence", - "Positive", - "student", + {}, + "teacher", 2, - "testA", - "class1", - "bad", + {"Positive": 0}, + {}, + "cba", ], [ 65, 90, 5000, "ft.onto.base_ontology.Sentence", - "Positive", + {}, "TA", - 3, - "testB", - "class2", - "good", + 1, + {"Positive": 0}, + {}, + "bad", ], - ] + ], ), "forte.data.ontology.top.Group": [ [ @@ -557,7 +610,7 @@ def test_save_attribute_pickle(self): 88888, "forte.data.ontology.top.Link", ], - ], + ], }, ) self.assertEqual( @@ -599,14 +652,14 @@ def test_save_attribute_pickle(self): 10123: ["forte.data.ontology.top.Group", 0], 23456: ["forte.data.ontology.top.Group", 1], 88888: ["forte.data.ontology.top.Link", 0], - } + }, ) # test check_attribute with accept_unknown_attribute = False with self.assertRaisesRegex( ValueError, - "Saved ft.onto.base_ontology.Document objects have unidentified" - " fields at indices 4, which raise an error.", + "Saved ft.onto.base_ontology.Sentence objects have unidentified" + " fields at indices 7, which raise an error.", ): DataStore.deserialize( tmpfilepath, @@ -626,19 +679,34 @@ def test_fast_pickle(self): DataStore._type_attributes = { "ft.onto.base_ontology.Document": { "attributes": { - "document_class": 4, - "sentiment": 5, - "classifications": 6, + "document_class": {"index": 4, "type": (list, (str,))}, + "sentiment": {"index": 5, "type": (dict, (str, float))}, + "classifications": { + "index": 6, + "type": (FDict, (str, Classification)), + }, }, "parent_entry": "forte.data.ontology.top.Annotation", }, "ft.onto.base_ontology.Sentence": { "attributes": { - "speaker": 4, - "part_id": 5, - "sentiment": 6, - "classification": 7, - "classifications": 8, + "speaker": { + "index": 4, + "type": (Union, (str, type(None))), + }, + "part_id": { + "index": 5, + "type": (Union, (int, type(None))), + }, + "sentiment": {"index": 6, "type": (dict, (str, float))}, + "classification": { + "index": 7, + "type": (dict, (str, float)), + }, + "classifications": { + "index": 8, + "type": (FDict, (str, Classification)), + }, }, "parent_entry": "forte.data.ontology.top.Annotation", }, @@ -711,19 +779,34 @@ def test_delete_serialize(self): DataStore._type_attributes = { "ft.onto.base_ontology.Document": { "attributes": { - "document_class": 4, - "sentiment": 
5, - "classifications": 6, + "document_class": {"index": 4, "type": (list, (str,))}, + "sentiment": {"index": 5, "type": (dict, (str, float))}, + "classifications": { + "index": 6, + "type": (FDict, (str, Classification)), + }, }, "parent_entry": "forte.data.ontology.top.Annotation", }, "ft.onto.base_ontology.Sentence": { "attributes": { - "speaker": 4, - "part_id": 5, - "sentiment": 6, - "classification": 7, - "classifications": 8, + "speaker": { + "index": 4, + "type": (Union, (str, type(None))), + }, + "part_id": { + "index": 5, + "type": (Union, (int, type(None))), + }, + "sentiment": {"index": 6, "type": (dict, (str, float))}, + "classification": { + "index": 7, + "type": (dict, (str, float)), + }, + "classifications": { + "index": 8, + "type": (FDict, (str, Classification)), + }, }, "parent_entry": "forte.data.ontology.top.Annotation", }, @@ -750,36 +833,40 @@ def test_delete_serialize(self): 5, 1234, "ft.onto.base_ontology.Document", - None, "Positive", - None, + ["Doc class A"], + {"Negative": 0}, + {}, ], [ 10, 25, 3456, "ft.onto.base_ontology.Document", - None, "Negative", - "Class B", + ["Doc class B"], + {}, + {}, ], [ 20, 25, 5678, "ft.onto.base_ontology.Document", - None, "Neutral", - "Class D", + ["Doc class D"], + {}, + {}, ], [ 40, 55, 7890, "ft.onto.base_ontology.Document", - None, "Very Positive", - "Class E", + ["Doc class E"], + {"Positive": 0}, + {}, ], ], ), @@ -792,31 +879,31 @@ def test_delete_serialize(self): "ft.onto.base_ontology.Sentence", "teacher", 1, - "Positive", - None, + {}, None, + {}, ], [ 55, 70, 1234567, "ft.onto.base_ontology.Sentence", + "student", + 2, + {}, None, - None, - "Negative", - None, - "Class D", + {}, ], [ 60, 90, 100, "ft.onto.base_ontology.Sentence", - "student", + "teacher", 2, - "Positive", + {}, None, - "class1", + {}, ], [ 65, @@ -824,10 +911,10 @@ def test_delete_serialize(self): 5000, "ft.onto.base_ontology.Sentence", "TA", - 3, - "Positive", + 1, + {}, None, - "class2", + {}, ], ], ), @@ -878,13 +965,13 @@ def test_delete_serialize(self): ][3], }, ) - + self.assertEqual( temp._DataStore__tid_idx_dict, { 23456: ["forte.data.ontology.top.Group", 0], 88888: ["forte.data.ontology.top.Link", 0], - } + }, ) diff --git a/tests/forte/data/data_store_test.py b/tests/forte/data/data_store_test.py index 7e155f19a..27930976e 100644 --- a/tests/forte/data/data_store_test.py +++ b/tests/forte/data/data_store_test.py @@ -20,7 +20,7 @@ import unittest import copy from sortedcontainers import SortedList -from typing import Optional, Dict +from typing import List, Optional, Dict, Union from dataclasses import dataclass from forte.data.data_store import DataStore from forte.data.ontology.top import ( @@ -35,6 +35,8 @@ ) from forte.data.data_pack import DataPack from forte.common import constants +from forte.data.ontology.core import FDict +from ft.onto.base_ontology import Classification logging.basicConfig(level=logging.DEBUG) @@ -126,19 +128,28 @@ def setUp(self) -> None: self.reference_type_attributes = { "ft.onto.base_ontology.Document": { "attributes": { - "document_class": 4, - "sentiment": 5, - "classifications": 6, + "document_class": {"index": 4, "type": (list, (str,))}, + "sentiment": {"index": 5, "type": (dict, (str, float))}, + "classifications": { + "index": 6, + "type": (FDict, (str, Classification)), + }, }, "parent_class": set(), }, "ft.onto.base_ontology.Sentence": { "attributes": { - "speaker": 4, - "part_id": 5, - "sentiment": 6, - "classification": 7, - "classifications": 8, + "speaker": {"index": 4, "type": (Union, (str, 
type(None)))}, + "part_id": {"index": 5, "type": (Union, (int, type(None)))}, + "sentiment": {"index": 6, "type": (dict, (str, float))}, + "classification": { + "index": 7, + "type": (dict, (str, float)), + }, + "classifications": { + "index": 8, + "type": (FDict, (str, Classification)), + }, }, "parent_class": set(), }, @@ -168,20 +179,26 @@ def setUp(self) -> None: DataStore._type_attributes["ft.onto.base_ontology.Document"] = { "attributes": { - "document_class": 4, - "sentiment": 5, - "classifications": 6, + "document_class": {"index": 4, "type": (list, (str,))}, + "sentiment": {"index": 5, "type": (dict, (str, float))}, + "classifications": { + "index": 6, + "type": (FDict, (str, Classification)), + }, }, "parent_class": set(), } DataStore._type_attributes["ft.onto.base_ontology.Sentence"] = { "attributes": { - "speaker": 4, - "part_id": 5, - "sentiment": 6, - "classification": 7, - "classifications": 8, + "speaker": {"index": 4, "type": (Union, (str, type(None)))}, + "part_id": {"index": 5, "type": (Union, (int, type(None)))}, + "sentiment": {"index": 6, "type": (dict, (str, float))}, + "classification": {"index": 7, "type": (dict, (str, float))}, + "classifications": { + "index": 8, + "type": (FDict, (str, Classification)), + }, }, "parent_class": set(), } @@ -1269,21 +1286,21 @@ def test_check_onto_file(self): expected_type_attributes = { "ft.onto.test.Description": { "attributes": { - "author": 4, - "passage_id": 5, + "author": {"index": 4, "type": (type(None), (str,))}, + "passage_id": {"index": 5, "type": (type(None), (str,))}, }, "parent_class": {"forte.data.ontology.top.Annotation"}, }, "ft.onto.test.EntityMention": { "attributes": { - "ner_type": 4, + "ner_type": {"index": 4, "type": (type(None), (str,))}, }, "parent_class": {"forte.data.ontology.top.Annotation"}, }, "ft.onto.test.MedicalEntityMention": { "attributes": { - "umls_entities": 4, - "umls_link": 5, + "umls_entities": {"index": 4, "type": (type(None), (int,))}, + "umls_link": {"index": 5, "type": (type(None), (str,))}, }, "parent_class": {"ft.onto.test.EntityMention"}, }, From 64d50df465071a8347edac136012ad934aff70ee Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Fri, 22 Jul 2022 12:58:42 -0700 Subject: [PATCH 02/20] ForwardRef import error --- forte/data/base_pack.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index b3513139d..14e533217 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -20,7 +20,6 @@ from abc import abstractmethod from pathlib import Path from typing import ( - ForwardRef, List, Optional, Set, @@ -32,6 +31,7 @@ Any, Iterable, ) +from typing_inspect import is_forward_ref from functools import partial from packaging.version import Version import jsonpickle @@ -483,10 +483,7 @@ def entry_getter(cls: Entry, attr_name: str): # If the attribute was an Entry object, only its tid # is stored in the DataStore and hence its needs to be converted. 
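# Illustrative sketch (not from the patch; assumes Python 3.7+, where
# ``typing.ForwardRef`` is importable): ``typing_inspect.is_forward_ref``
# flags unevaluated string annotations, such as the "Classification"-style
# references produced for generated ontology classes.
from typing import ForwardRef
from typing_inspect import is_forward_ref

assert is_forward_ref(ForwardRef("Classification"))
assert not is_forward_ref(int)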
if entry_type[1] and ( - any( - isinstance(entry, ForwardRef) - for entry in list(entry_type[1]) - ) + any(is_forward_ref(entry) for entry in list(entry_type[1])) or any( issubclass(entry, (Entry)) for entry in list(entry_type[1]) From 442297672367c771bd3e9a60414a974a4e34f16b Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Fri, 22 Jul 2022 13:08:20 -0700 Subject: [PATCH 03/20] default attribute type fix --- forte/data/data_store.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index e38ed01f0..066024117 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -646,13 +646,14 @@ def _default_attributes_for_type(self, type_name: str) -> List: attr_dict (list): A list of attributes with default values. """ attr_dict: Dict = self._get_type_attribute_dict(type_name) + attr_fields: Dict = self._get_entry_attributes_by_class(type_name) attr_list: List = [None] * len(attr_dict) for attr_name, attr_info in attr_dict.items(): # TODO: We should keep a record of the attribute class instead of # inspecting the class on the fly. attr_id = attr_info[constants.ATTR_INDEX_KEY] - attr_class = attr_dict[attr_name][constants.ATTR_TYPE_KEY][0] + attr_class = get_origin(attr_fields[attr_name].type) if attr_class in (FList, list, List): attr_list[attr_id - constants.ATTR_BEGIN_INDEX] = [] elif attr_class in (FDict, dict, Dict): From 84f0e91454cb23acf8beb42e42b17600d91044c7 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Fri, 22 Jul 2022 16:52:01 -0700 Subject: [PATCH 04/20] Serialization error fix --- forte/data/base_pack.py | 3 +-- forte/data/data_store.py | 8 ++++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 14e533217..1fcebf895 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -31,8 +31,8 @@ Any, Iterable, ) -from typing_inspect import is_forward_ref from functools import partial +from typing_inspect import is_forward_ref from packaging.version import Version import jsonpickle @@ -525,7 +525,6 @@ def entry_setter(cls: Entry, value: Any, attr_name: str): entry_type = data_store_ref.get_entry_types( cls.entry_type(), attr_name ) - # Assumption: Users will not assign value to a FList/FDict field. # Only internal methods can set the FList/FDict field, and value's # type has to be Iterator[Entry]/Dict[Any, Entry]. diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 066024117..65f3f08d8 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -11,6 +11,7 @@ # See the License for the specific language governing permissions and # limitations under the License. 
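# Illustrative sketch (not from the patch; dictionary contents are made up):
# the deepcopy added below matters because ``__getstate__`` pops the "type"
# records out of the state it serializes; without a copy that pop would also
# mutate the shared, class-level ``_type_attributes``.
from copy import deepcopy

_shared = {"Sentence": {"attributes": {"speaker": {"index": 4, "type": (str,)}}}}
_state = deepcopy(_shared)
_state["Sentence"]["attributes"]["speaker"].pop("type")
assert "type" in _shared["Sentence"]["attributes"]["speaker"]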
+from copy import deepcopy import json from typing import Dict, List, Iterator, Set, Tuple, Optional, Any, Type @@ -271,10 +272,13 @@ def __getstate__(self): """ state = super().__getstate__() state["_DataStore__elements"] = {} + state["_DataStore_type_attributes"] = deepcopy(self._type_attributes) + for k in self.__elements: # build the full `_type_attributes` self._get_type_info(k) - for _, info in self._type_attributes[k][ + + for _, info in state["_DataStore_type_attributes"][k][ constants.TYPE_ATTR_KEY ].items(): info.pop(constants.ATTR_TYPE_KEY) @@ -284,7 +288,7 @@ def __getstate__(self): state.pop("_DataStore__deletion_count") state["entries"] = state.pop("_DataStore__elements") - state["fields"] = self._type_attributes + state["fields"] = state["_DataStore_type_attributes"] for _, v in state["fields"].items(): if constants.PARENT_CLASS_KEY in v: v.pop(constants.PARENT_CLASS_KEY) From 16bbc4f16a87c2e3003fb9c229f038bde03c9874 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Fri, 22 Jul 2022 21:28:58 -0700 Subject: [PATCH 05/20] getstate logic modification --- forte/data/base_pack.py | 13 +++--- forte/data/data_store.py | 40 ++++++++++++++----- .../data/ontology/ontology_code_generator.py | 2 +- tests/forte/data/data_store_test.py | 22 +++++++--- 4 files changed, 54 insertions(+), 23 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 1fcebf895..3e507a618 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -20,6 +20,7 @@ from abc import abstractmethod from pathlib import Path from typing import ( + ForwardRef, List, Optional, Set, @@ -32,7 +33,7 @@ Iterable, ) from functools import partial -from typing_inspect import is_forward_ref +from inspect import isclass from packaging.version import Version import jsonpickle @@ -482,12 +483,10 @@ def entry_getter(cls: Entry, attr_name: str): # Check dataclass attribute value type # If the attribute was an Entry object, only its tid # is stored in the DataStore and hence its needs to be converted. - if entry_type[1] and ( - any(is_forward_ref(entry) for entry in list(entry_type[1])) - or any( - issubclass(entry, (Entry)) - for entry in list(entry_type[1]) - ) + if entry_type[1] and any( + issubclass(entry, (Entry, ForwardRef)) + for entry in list(entry_type[1]) + if isclass(entry) ): return cls.pack.get_entry(tid=attr_val) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 65f3f08d8..70f9f5d01 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -12,6 +12,7 @@ # limitations under the License. 
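# Illustrative sketch (not from the patch; records below are made up): only
# the attribute indices survive pickling in ``state["fields"]``; the "type"
# component is dropped by ``__getstate__`` and recomputed from the entry
# classes when ``__setstate__`` runs, roughly like this:
_restored = {"attributes": {"speaker": {"index": 4}}}
_recomputed = {"speaker": ("Union", ("str", "NoneType"))}  # derived from the class
for _attr, _info in _restored["attributes"].items():
    _info.setdefault("type", _recomputed[_attr])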
from copy import deepcopy +from inspect import isclass import json from typing import Dict, List, Iterator, Set, Tuple, Optional, Any, Type @@ -21,6 +22,8 @@ from sortedcontainers import SortedList from typing_inspect import get_origin, get_args +# from ft.onto.base_ontology import Utterance + from forte.utils import get_class from forte.utils.utils import get_full_module_name from forte.data.ontology.code_generation_objects import EntryTree @@ -272,23 +275,27 @@ def __getstate__(self): """ state = super().__getstate__() state["_DataStore__elements"] = {} - state["_DataStore_type_attributes"] = deepcopy(self._type_attributes) + + # Make a copy of the updated type_attributes + state["_type_attributes"] = deepcopy(DataStore._type_attributes) for k in self.__elements: + # build the full `_type_attributes` self._get_type_info(k) - - for _, info in state["_DataStore_type_attributes"][k][ + for _, info in state["_type_attributes"][k][ constants.TYPE_ATTR_KEY ].items(): info.pop(constants.ATTR_TYPE_KEY) + state["_DataStore__elements"][k] = list(self.__elements[k]) + state.pop("_DataStore__tid_ref_dict") state.pop("_DataStore__tid_idx_dict") state.pop("_DataStore__deletion_count") state["entries"] = state.pop("_DataStore__elements") - state["fields"] = state["_DataStore_type_attributes"] + state["fields"] = state["_type_attributes"] for _, v in state["fields"].items(): if constants.PARENT_CLASS_KEY in v: v.pop(constants.PARENT_CLASS_KEY) @@ -666,7 +673,8 @@ def _default_attributes_for_type(self, type_name: str) -> List: def _add_entry_types( self, type_name: str, attributes: Optional[Set[Tuple[str, str]]] = None - ) -> Dict: + ) -> Dict[str, Tuple]: + # [str, Tuple[Any, Tuple[Any]]]: r"""This function takes a fully qualified ``type_name`` class name, adds the type of all its dataclass attributes to the `_entry_type_dict` dictionary class variable. @@ -684,10 +692,14 @@ def _add_entry_types( } """ type_dict = {} + attr_class: Any + attr_args: Tuple if attributes: for attr, type_val in attributes: - type_dict[attr] = (type(None), (get_class(type_val),)) + attr_class = type(None) + attr_args = tuple([get_class(type_val)]) + type_dict[attr] = tuple([attr_class, attr_args]) else: attr_fields: Dict = self._get_entry_attributes_by_class(type_name) @@ -695,7 +707,15 @@ def _add_entry_types( attr_class = get_origin(attr_info.type) attr_args = get_args(attr_info.type) - type_dict[attr_name] = (attr_class, attr_args) + attr_class = ( + attr_class if isclass(attr_class) else attr_class.__class__ + ) + + attr_args = tuple( + val if isclass(val) else val.__class__ for val in attr_args + ) + + type_dict[attr_name] = tuple([attr_class, attr_args]) return type_dict @@ -803,9 +823,9 @@ def get_entry_types( stored in the ``_type_attributes`` dictionary of the Data Store. 
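        For example (illustrative, not from the patch), given a hypothetical
        ``DataStore`` whose ``ft.onto.base_ontology.Sentence`` record matches
        the layout shown in the class docstring:

        .. code-block:: python

            data_store.get_entry_types(
                "ft.onto.base_ontology.Sentence", "speaker"
            )
            # returns (typing.Union, (str, type(None)))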
""" try: - return self._type_attributes[type_name][constants.TYPE_ATTR_KEY][ - attr_name - ][constants.ATTR_TYPE_KEY] + return DataStore._type_attributes[type_name][ + constants.TYPE_ATTR_KEY + ][attr_name][constants.ATTR_TYPE_KEY] except KeyError as e: raise KeyError( f"Attribute {attr_name} does not have type " diff --git a/forte/data/ontology/ontology_code_generator.py b/forte/data/ontology/ontology_code_generator.py index 84b7be48f..5a21fca83 100644 --- a/forte/data/ontology/ontology_code_generator.py +++ b/forte/data/ontology/ontology_code_generator.py @@ -968,7 +968,7 @@ def construct_init(self, entry_name: EntryName, base_entry: str): def parse_entry( self, entry_name: EntryName, schema: Dict - ) -> Tuple[EntryDefinition, List[str]]: + ) -> Tuple[EntryDefinition, List[Tuple[Any, Any]]]: """ Args: entry_name: Object holds various name form of the entry. diff --git a/tests/forte/data/data_store_test.py b/tests/forte/data/data_store_test.py index 27930976e..7225ce25d 100644 --- a/tests/forte/data/data_store_test.py +++ b/tests/forte/data/data_store_test.py @@ -20,7 +20,7 @@ import unittest import copy from sortedcontainers import SortedList -from typing import List, Optional, Dict, Union +from typing import List, Optional, Dict, Union, _SpecialForm from dataclasses import dataclass from forte.data.data_store import DataStore from forte.data.ontology.top import ( @@ -139,8 +139,14 @@ def setUp(self) -> None: }, "ft.onto.base_ontology.Sentence": { "attributes": { - "speaker": {"index": 4, "type": (Union, (str, type(None)))}, - "part_id": {"index": 5, "type": (Union, (int, type(None)))}, + "speaker": { + "index": 4, + "type": (_SpecialForm, (str, type(None))), + }, + "part_id": { + "index": 5, + "type": (_SpecialForm, (int, type(None))), + }, "sentiment": {"index": 6, "type": (dict, (str, float))}, "classification": { "index": 7, @@ -191,8 +197,14 @@ def setUp(self) -> None: DataStore._type_attributes["ft.onto.base_ontology.Sentence"] = { "attributes": { - "speaker": {"index": 4, "type": (Union, (str, type(None)))}, - "part_id": {"index": 5, "type": (Union, (int, type(None)))}, + "speaker": { + "index": 4, + "type": (_SpecialForm, (str, type(None))), + }, + "part_id": { + "index": 5, + "type": (_SpecialForm, (int, type(None))), + }, "sentiment": {"index": 6, "type": (dict, (str, float))}, "classification": {"index": 7, "type": (dict, (str, float))}, "classifications": { From 3e714ee21e71e5c05ad61bba6a124c2867f39429 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Mon, 25 Jul 2022 10:00:02 -0700 Subject: [PATCH 06/20] remove forward_ref import usage --- forte/data/base_pack.py | 7 ++++--- forte/data/data_store.py | 10 ---------- tests/forte/data/data_store_test.py | 8 ++++---- 3 files changed, 8 insertions(+), 17 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 3e507a618..e1758885b 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -20,7 +20,6 @@ from abc import abstractmethod from pathlib import Path from typing import ( - ForwardRef, List, Optional, Set, @@ -34,6 +33,7 @@ ) from functools import partial from inspect import isclass +from typing_inspect import is_forward_ref from packaging.version import Version import jsonpickle @@ -484,9 +484,10 @@ def entry_getter(cls: Entry, attr_name: str): # If the attribute was an Entry object, only its tid # is stored in the DataStore and hence its needs to be converted. 
if entry_type[1] and any( - issubclass(entry, (Entry, ForwardRef)) - for entry in list(entry_type[1]) + issubclass(entry, Entry) if isclass(entry) + else is_forward_ref(entry) + for entry in list(entry_type[1]) ): return cls.pack.get_entry(tid=attr_val) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 70f9f5d01..1e8d7a9af 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -12,7 +12,6 @@ # limitations under the License. from copy import deepcopy -from inspect import isclass import json from typing import Dict, List, Iterator, Set, Tuple, Optional, Any, Type @@ -674,7 +673,6 @@ def _default_attributes_for_type(self, type_name: str) -> List: def _add_entry_types( self, type_name: str, attributes: Optional[Set[Tuple[str, str]]] = None ) -> Dict[str, Tuple]: - # [str, Tuple[Any, Tuple[Any]]]: r"""This function takes a fully qualified ``type_name`` class name, adds the type of all its dataclass attributes to the `_entry_type_dict` dictionary class variable. @@ -707,14 +705,6 @@ def _add_entry_types( attr_class = get_origin(attr_info.type) attr_args = get_args(attr_info.type) - attr_class = ( - attr_class if isclass(attr_class) else attr_class.__class__ - ) - - attr_args = tuple( - val if isclass(val) else val.__class__ for val in attr_args - ) - type_dict[attr_name] = tuple([attr_class, attr_args]) return type_dict diff --git a/tests/forte/data/data_store_test.py b/tests/forte/data/data_store_test.py index 7225ce25d..b9c0cc99c 100644 --- a/tests/forte/data/data_store_test.py +++ b/tests/forte/data/data_store_test.py @@ -141,11 +141,11 @@ def setUp(self) -> None: "attributes": { "speaker": { "index": 4, - "type": (_SpecialForm, (str, type(None))), + "type": (Union, (str, type(None))), }, "part_id": { "index": 5, - "type": (_SpecialForm, (int, type(None))), + "type": (Union, (int, type(None))), }, "sentiment": {"index": 6, "type": (dict, (str, float))}, "classification": { @@ -199,11 +199,11 @@ def setUp(self) -> None: "attributes": { "speaker": { "index": 4, - "type": (_SpecialForm, (str, type(None))), + "type": (Union, (str, type(None))), }, "part_id": { "index": 5, - "type": (_SpecialForm, (int, type(None))), + "type": (Union, (int, type(None))), }, "sentiment": {"index": 6, "type": (dict, (str, float))}, "classification": {"index": 7, "type": (dict, (str, float))}, From 34eeb79865c2ac62243db219d61a8b9f3a0e18bc Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Mon, 25 Jul 2022 19:16:27 -0700 Subject: [PATCH 07/20] Multipack handling --- forte/data/base_pack.py | 14 +++++++++----- forte/data/ontology/code_generation_objects.py | 18 +++++++++++------- forte/data/ontology/ontology_code_generator.py | 9 ++++++--- tests/forte/data/data_store_test.py | 2 +- 4 files changed, 27 insertions(+), 16 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index e1758885b..0e18fd106 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -483,11 +483,15 @@ def entry_getter(cls: Entry, attr_name: str): # Check dataclass attribute value type # If the attribute was an Entry object, only its tid # is stored in the DataStore and hence its needs to be converted. 
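# Illustrative sketch (not from the patch; ``_FakeEntry`` stands in for
# forte's ``Entry``): restoring the ``isinstance(attr_val, int)`` guard means
# only plain integers are treated as tids; ``None`` (an unset field) or a
# ``(pack_id, tid)`` pair for a cross-pack reference falls through to the
# handling further below.
from inspect import isclass


class _FakeEntry:
    pass


def _is_single_pack_tid(attr_val, declared_args):
    return isinstance(attr_val, int) and any(
        isclass(arg) and issubclass(arg, _FakeEntry) for arg in declared_args
    )


assert _is_single_pack_tid(9999, (_FakeEntry, type(None)))
assert not _is_single_pack_tid(None, (_FakeEntry, type(None)))
assert not _is_single_pack_tid((1234, 9999), (_FakeEntry,))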
- if entry_type[1] and any( - issubclass(entry, Entry) - if isclass(entry) - else is_forward_ref(entry) - for entry in list(entry_type[1]) + if ( + isinstance(attr_val, int) + and entry_type[1] + and any( + issubclass(entry, Entry) + if isclass(entry) + else is_forward_ref(entry) + for entry in list(entry_type[1]) + ) ): return cls.pack.get_entry(tid=attr_val) diff --git a/forte/data/ontology/code_generation_objects.py b/forte/data/ontology/code_generation_objects.py index 2098fbd0a..c28649f85 100644 --- a/forte/data/ontology/code_generation_objects.py +++ b/forte/data/ontology/code_generation_objects.py @@ -797,7 +797,7 @@ def __init__(self, name: str): self.children: List[EntryTreeNode] = [] self.parent: Optional[EntryTreeNode] = None self.name: str = name - self.attributes: Set[str] = set() + self.attributes: Set[Tuple] = set() def __repr__(self): r"""for printing purpose.""" @@ -817,7 +817,7 @@ def add_node( self, curr_entry_name: str, parent_entry_name: str, - curr_entry_attr: Set[str], + curr_entry_attr: Set[Tuple], ): r"""Add a tree node with `curr_entry_name` as a child to `parent_entry_name` in the tree, the attributes `curr_entry_attr` @@ -864,9 +864,9 @@ def collect_parents(self, node_dict: Dict[str, Set[str]]): found_node = search(self.root, search_node_name=node_name) if found_node is not None: while found_node.parent.name != "root": - node_dict[ - found_node.parent.name - ] = found_node.parent.attributes + node_dict[found_node.parent.name] = set( + val[0] for val in found_node.parent.attributes + ) found_node = found_node.parent def todict(self) -> Dict[str, Any]: @@ -906,12 +906,16 @@ def fromdict( if parent_entry_name is None: self.root = EntryTreeNode(name=tree_dict["name"]) - self.root.attributes = set(tree_dict["attributes"]) + self.root.attributes = set( + tuple(attr) for attr in tree_dict["attributes"] + ) else: self.add_node( curr_entry_name=tree_dict["name"], parent_entry_name=parent_entry_name, - curr_entry_attr=set(tree_dict["attributes"]), + curr_entry_attr=set( + tuple(attr) for attr in tree_dict["attributes"] + ), ) for child in tree_dict["children"]: self.fromdict(child, tree_dict["name"]) diff --git a/forte/data/ontology/ontology_code_generator.py b/forte/data/ontology/ontology_code_generator.py index 5a21fca83..06b240ae9 100644 --- a/forte/data/ontology/ontology_code_generator.py +++ b/forte/data/ontology/ontology_code_generator.py @@ -269,7 +269,8 @@ def __init__( # Adjacency list to store the allowed types (in-built or user-defined), # and their attributes (if any) in order to validate the attribute # types. - self.allowed_types_tree: Dict[str, Set] = {} + self.allowed_types_tree: Dict[str, Set[Tuple]] = {} + for type_str in ALL_INBUILT_TYPES: self.allowed_types_tree[type_str] = set() @@ -820,14 +821,16 @@ def parse_schema( f"python identifier." 
) - if property_name in self.allowed_types_tree[en.class_name]: + if property_name in set( + val[0] for val in self.allowed_types_tree[en.class_name] + ): warnings.warn( f"Attribute type for the entry {en.class_name} " f"and the attribute {property_name} already present in " f"the ontology, will be overridden", DuplicatedAttributesWarning, ) - self.allowed_types_tree[en.class_name].add(property) + self.allowed_types_tree[en.class_name].add(tuple(property)) # populate the entry tree based on information if merged_entry_tree is not None: curr_entry_name = en.class_name diff --git a/tests/forte/data/data_store_test.py b/tests/forte/data/data_store_test.py index b9c0cc99c..806be4747 100644 --- a/tests/forte/data/data_store_test.py +++ b/tests/forte/data/data_store_test.py @@ -20,7 +20,7 @@ import unittest import copy from sortedcontainers import SortedList -from typing import List, Optional, Dict, Union, _SpecialForm +from typing import List, Optional, Dict, Union from dataclasses import dataclass from forte.data.data_store import DataStore from forte.data.ontology.top import ( From 861ffdfe66af8edadfbe856c2a40fb3ddd718f4c Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Tue, 26 Jul 2022 09:43:06 -0700 Subject: [PATCH 08/20] Extra key removal from state --- forte/data/data_store.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index bd2b57e09..e9fcb0dc7 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -278,7 +278,6 @@ def __getstate__(self): state["_type_attributes"] = deepcopy(DataStore._type_attributes) for k in self.__elements: - # build the full `_type_attributes` self._get_type_info(k) for _, info in state["_type_attributes"][k][ @@ -292,8 +291,7 @@ def __getstate__(self): state.pop("_DataStore__tid_idx_dict") state.pop("_DataStore__deletion_count") state["entries"] = state.pop("_DataStore__elements") - - state["fields"] = state["_type_attributes"] + state["fields"] = state.pop("_type_attributes") for _, v in state["fields"].items(): if constants.PARENT_CLASS_KEY in v: v.pop(constants.PARENT_CLASS_KEY) From 78d8d12fdc8f85d19b3b1921534ef9a1e8051aa4 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Wed, 27 Jul 2022 11:50:57 -0700 Subject: [PATCH 09/20] Handling python 3.6 fix --- forte/data/data_store.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index e9fcb0dc7..5f667ec18 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -21,8 +21,6 @@ from sortedcontainers import SortedList from typing_inspect import get_origin, get_args -# from ft.onto.base_ontology import Utterance - from forte.utils import get_class from forte.utils.utils import get_full_module_name from forte.data.ontology.code_generation_objects import EntryTree @@ -275,7 +273,8 @@ def __getstate__(self): state["_DataStore__elements"] = {} # Make a copy of the updated type_attributes - state["_type_attributes"] = deepcopy(DataStore._type_attributes) + type_attributes = deepcopy(DataStore._type_attributes) + state["_type_attributes"] = DataStore._type_attributes for k in self.__elements: # build the full `_type_attributes` @@ -295,6 +294,7 @@ def __getstate__(self): for _, v in state["fields"].items(): if constants.PARENT_CLASS_KEY in v: v.pop(constants.PARENT_CLASS_KEY) + DataStore._type_attributes = type_attributes return state def __setstate__(self, state): @@ -702,6 +702,14 @@ def _add_entry_types( attr_class = 
get_origin(attr_info.type) attr_args = get_args(attr_info.type) + # Prior to Python 3.7, typing.List and typing.Dict + # is not converted to primitive forms of list and + # dict. We handle them separately here + if attr_class == Dict: + attr_class = dict + if attr_class == List: + attr_class = list + type_dict[attr_name] = tuple([attr_class, attr_args]) return type_dict From b06ebefc60654045cfec84e7fc0c47cfd3268095 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Wed, 27 Jul 2022 12:58:24 -0700 Subject: [PATCH 10/20] payload cache and embedding handling --- forte/data/ontology/top.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index f2e6f8de7..3dce620cf 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1250,6 +1250,12 @@ def __getstate__(self): # Entry store is being integrated into DataStore state = self.__dict__.copy() state["_modality"] = self._modality.name + + if isinstance(state["_cache"], np.ndarray): + state["_cache"] = list(self._cache.tolist()) + if isinstance(state["_embedding"], np.ndarray): + state["_embedding"] = list(self._embedding.tolist()) + return state def __setstate__(self, state): @@ -1261,6 +1267,15 @@ def __setstate__(self, state): self.__dict__.update(state) self._modality = getattr(Modality, state["_modality"]) + # During de-serialization, convert the list back to numpy array. + if "_embedding" in state: + state["_embedding"] = np.array(state["_embedding"]) + else: + state["_embedding"] = np.empty(0) + + if "_cache" in state and isinstance(state["_cache"], list): + state["_cache"] = np.array(state["_cache"]) + SinglePackEntries = ( Link, From b0606cff8b435e1395a622f1eab6c29af3d6e0af Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Wed, 27 Jul 2022 13:15:29 -0700 Subject: [PATCH 11/20] Documentation fix --- forte/data/data_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 5f667ec18..8c467ac89 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -808,13 +808,13 @@ def get_entry_types( in an entry of type ``type_name`` Args: - type_name (str): The type name of the entry whose attribute enty + type_name (str): The type name of the entry whose attribute entry type needs to be fetched attr_name (str): The name of the attribute in the entry whose type information needs to be fetched. Returns: - The type information of the required attribute. This infromation is + The type information of the required attribute. This information is stored in the ``_type_attributes`` dictionary of the Data Store. """ try: From 17e51cfd104a203ca7a924c9023596ec58cca3ba Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Wed, 3 Aug 2022 12:33:28 -0700 Subject: [PATCH 12/20] docstrings and review changes --- forte/common/constants.py | 2 +- forte/data/base_pack.py | 10 +- forte/data/data_store.py | 100 ++++++++++++------ .../data/ontology/ontology_code_generator.py | 12 ++- .../data/data_store_serialization_test.py | 57 +++------- 5 files changed, 99 insertions(+), 82 deletions(-) diff --git a/forte/common/constants.py b/forte/common/constants.py index ac90f7d35..1032f8310 100644 --- a/forte/common/constants.py +++ b/forte/common/constants.py @@ -36,7 +36,7 @@ # Name of the key to access the attribute dict of an entry type from # ``_type_attributes`` of ``DataStore``. 
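# Illustrative sketch (not from the patch; record contents are made up): with
# the rename below, code navigates an entry-type record as
# record[ATTR_INFO_KEY][name][ATTR_INDEX_KEY] or [ATTR_TYPE_KEY], i.e. using
# the literal key values:
_record = {
    "attributes": {"speaker": {"index": 4, "type": ("Union", ("str", "NoneType"))}},
    "parent_class": set(),
}
_slot = _record["attributes"]["speaker"]["index"]   # ATTR_INFO_KEY / ATTR_INDEX_KEY
_tinfo = _record["attributes"]["speaker"]["type"]   # ATTR_INFO_KEY / ATTR_TYPE_KEY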
-TYPE_ATTR_KEY = "attributes" +ATTR_INFO_KEY = "attributes" # Name of the key to access the type of an attribute from # ``_type_attributes`` of ``DataStore``. diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 0e18fd106..0e6a5229f 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -483,6 +483,14 @@ def entry_getter(cls: Entry, attr_name: str): # Check dataclass attribute value type # If the attribute was an Entry object, only its tid # is stored in the DataStore and hence its needs to be converted. + + # Entry objects are stored in data stores by their tid (which is + # of type int). Thus, if we enounter an int value, we check the + # type information which is stored as a tuple. if any entry in this + # tuple is a subclass of Entry or is a ForwardRef to another entry, + # we can infer that this int value represents the tid of an Entry + # object and thus must be converted to an object using get_entry + # before returning. if ( isinstance(attr_val, int) and entry_type[1] @@ -571,7 +579,7 @@ def entry_setter(cls: Entry, value: Any, attr_name: str): self._save_entry_to_data_store(entry=entry) # Register property functions for all dataclass fields. - for name, _ in entry.__dataclass_fields__.items(): + for name in entry.__dataclass_fields__: # Convert the typing annotation to the original class. # This will be used to determine if a field is FList/FDict. setattr( diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 8c467ac89..b7f059ae4 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -162,10 +162,15 @@ def __init__( The keys are fully qualified names of every type; The value is a dictionary with two keys. Key ``attribute`` provides an inner dictionary with all valid attributes for this type and the information of attributes - among these lists. This is information is represented as a dictionary. The - information represented in this dictionary is the index of the attribute - and the type of the variable it stores. Key ``parent_class`` is a string - representing the ancestors of this type. + among these lists. This information is represented as a dictionary. The + dictionary has two entries, the first is index which determines the position + where an attribute is stored in a data store entry. The second is type, which + is a tuple of two elements that provides the type information of a given + attribute. The first element is the unsubscripted version of + the attribute's type and the second element is the type arguments + for the same. The information represented in this dictionary is the index of + the attribute and the type of the variable it stores. Key ``parent_class`` is + a string representing the ancestors of this type. This structure is supposed to be built dynamically. 
When a user adds new entries, `DataStore` will check unknown types and add them to @@ -274,26 +279,24 @@ def __getstate__(self): # Make a copy of the updated type_attributes type_attributes = deepcopy(DataStore._type_attributes) - state["_type_attributes"] = DataStore._type_attributes + state["fields"] = DataStore._type_attributes for k in self.__elements: # build the full `_type_attributes` self._get_type_info(k) - for _, info in state["_type_attributes"][k][ - constants.TYPE_ATTR_KEY - ].items(): - info.pop(constants.ATTR_TYPE_KEY) - state["_DataStore__elements"][k] = list(self.__elements[k]) state.pop("_DataStore__tid_ref_dict") state.pop("_DataStore__tid_idx_dict") state.pop("_DataStore__deletion_count") state["entries"] = state.pop("_DataStore__elements") - state["fields"] = state.pop("_type_attributes") for _, v in state["fields"].items(): if constants.PARENT_CLASS_KEY in v: v.pop(constants.PARENT_CLASS_KEY) + + if constants.ATTR_INFO_KEY in v: + for _, info in v[constants.ATTR_INFO_KEY].items(): + info.pop(constants.ATTR_TYPE_KEY) DataStore._type_attributes = type_attributes return state @@ -314,11 +317,11 @@ def __setstate__(self, state): # Update `_type_attributes` to store the types of each # entry attribute as well. for tn in self._type_attributes: - entry_type = self._add_entry_types(tn) + entry_type = self._new_entry_types(tn) for attr, type_val in entry_type.items(): try: info_dict = self._type_attributes[tn][ - constants.TYPE_ATTR_KEY + constants.ATTR_INFO_KEY ][attr] except KeyError: continue @@ -439,7 +442,7 @@ def check_fields(store): # name and index get_temp_rep = lambda entry: set( (attr, val[constants.ATTR_INDEX_KEY]) - for attr, val in entry[constants.TYPE_ATTR_KEY].items() + for attr, val in entry[constants.ATTR_INFO_KEY].items() ) temp_cls_rep = get_temp_rep(v) @@ -452,12 +455,12 @@ def check_fields(store): # fields to match the order of the current class. if ( f[0] - in store._type_attributes[t][constants.TYPE_ATTR_KEY] + in store._type_attributes[t][constants.ATTR_INFO_KEY] ): # record indices of the same field in the class and # objects. Save different indices to a dictionary. change_map[f[1]] = store._type_attributes[t][ - constants.TYPE_ATTR_KEY + constants.ATTR_INFO_KEY ][f[0]][constants.ATTR_INDEX_KEY] # record indices of fields that only appear in the # current class. We want to fill them with None. @@ -484,7 +487,7 @@ def check_fields(store): max( info[constants.ATTR_INDEX_KEY] for info in v[ - constants.TYPE_ATTR_KEY + constants.ATTR_INFO_KEY ].values() ) + 1 @@ -546,7 +549,8 @@ def _get_type_info(self, type_name: str) -> Dict[str, Any]: ``DataStore._type_attributes``. If the ``type_name`` does not currently exists and dynamic import is enabled, this function will add a new key-value pair into ``DataStore._type_attributes``. The value consists - of a full attribute-to-index dictionary and an empty parent set. + a dictionary which stores the name and the type information of every + attribute of the entry and an empty parent set. This function returns a dictionary containing an attribute dict and a set of parent entries of the given type. 
For example: @@ -577,7 +581,7 @@ def _get_type_info(self, type_name: str) -> Dict[str, Any]: # check if type is in dictionary if ( type_name in DataStore._type_attributes - and constants.TYPE_ATTR_KEY in DataStore._type_attributes[type_name] + and constants.ATTR_INFO_KEY in DataStore._type_attributes[type_name] ): return DataStore._type_attributes[type_name] if not self._dynamically_add_type: @@ -592,7 +596,7 @@ def _get_type_info(self, type_name: str) -> Dict[str, Any]: attr_dict = {} attr_idx = constants.ENTRY_TYPE_INDEX + 1 - type_dict = self._add_entry_types(type_name) + type_dict = self._new_entry_types(type_name) for attr_name in attributes: attr_dict[attr_name] = { @@ -602,7 +606,7 @@ def _get_type_info(self, type_name: str) -> Dict[str, Any]: attr_idx += 1 new_entry_info = { - constants.TYPE_ATTR_KEY: attr_dict, + constants.ATTR_INFO_KEY: attr_dict, constants.PARENT_CLASS_KEY: set(), } DataStore._type_attributes[type_name] = new_entry_info @@ -627,7 +631,7 @@ def _get_type_attribute_dict(self, type_name: str) -> Dict[str, Dict]: Returns: attr_dict (dict): The attribute-to-index dictionary of an entry. """ - return self._get_type_info(type_name)[constants.TYPE_ATTR_KEY] + return self._get_type_info(type_name)[constants.ATTR_INFO_KEY] def _get_type_parent(self, type_name: str) -> str: """Get a set of parent names of an entry type. The set is a subset of all @@ -667,24 +671,41 @@ def _default_attributes_for_type(self, type_name: str) -> List: attr_list[attr_id - constants.ATTR_BEGIN_INDEX] = {} return attr_list - def _add_entry_types( + def _new_entry_types( self, type_name: str, attributes: Optional[Set[Tuple[str, str]]] = None ) -> Dict[str, Tuple]: - r"""This function takes a fully qualified ``type_name`` class name, - adds the type of all its dataclass attributes to the - `_entry_type_dict` dictionary class variable. + r"""This function takes a fully qualified ``type_name`` class name + and creates a dictionary where the key is attribute of the entry + and value is the type information of that attribute. For example, + + .. code-block:: python + + type_dict = { + "document_class": (list, (str,)), + "sentiment": (dict, (str, float)), + "classifications": (FDict, (str, Classification)) + } + + For each attribute, the type information is represented by a tuple + of two elements. The first element is the unsubscripted version of + the attribute's type and the second element is the type arguments + for the same. The `type_dict` is used to populate the type + information for attributes of an entry specified by ``type_name`` + in `_type_attributes`. + Args: type_name: A fully qualified name of an entry class. attributes: This argument is used when parsing ontology files. The entries in the set are a tuples of two elements. + .. code-block:: python - attributes = { - ('passage_id', 'str'), - ('author', 'str') - } + attributes = { + ('passage_id', 'str'), + ('author', 'str') + } """ type_dict = {} attr_class: Any @@ -692,6 +713,14 @@ def _add_entry_types( if attributes: for attr, type_val in attributes: + # the type_dict only stores the type of each + # attribute class. When attributes and their + # types are defined in ontology files, these + # values are stored in attr_args. attr_class + # is empty in this case and has a value of + # None. But to maintain the consistency of + # type_dict, we only store the type of every + # value, even None. 
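As a concrete illustration of that convention (a sketch with assumed attribute names, not code from this patch), two attributes declared in an ontology file would land in ``type_dict`` as:

.. code-block:: python

    # Attributes parsed from an ontology file carry only a type string, so
    # the "origin" slot is filled with NoneType and the resolved class goes
    # into the args tuple.
    attributes = {("passage_id", "str"), ("author", "str")}

    type_dict = {
        # str stands in here for whatever the type string resolves to.
        attr: (type(None), (str,))
        for attr, _ in attributes
    }
    # -> {"passage_id": (type(None), (str,)), "author": (type(None), (str,))}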
attr_class = type(None) attr_args = tuple([get_class(type_val)]) type_dict[attr] = tuple([attr_class, attr_args]) @@ -819,12 +848,13 @@ def get_entry_types( """ try: return DataStore._type_attributes[type_name][ - constants.TYPE_ATTR_KEY + constants.ATTR_INFO_KEY ][attr_name][constants.ATTR_TYPE_KEY] except KeyError as e: raise KeyError( f"Attribute {attr_name} does not have type " - "information provided" + f"information provided or attribute {attr_name}" + f"is not a valid attribute of entry {type_name}" ) from e def all_entries(self, entry_type_name: str) -> Iterator[List]: @@ -1771,6 +1801,8 @@ def _parse_onto_file(self): children = entry_tree.root.children while len(children) > 0: + # entry_node represents a node in the ontology tree + # generated by parsing an existing ontology file entry_node = children.pop(0) children.extend(entry_node.children) @@ -1780,7 +1812,7 @@ def _parse_onto_file(self): attr_dict = {} idx = constants.ATTR_BEGIN_INDEX - type_dict = self._add_entry_types(entry_name, entry_node.attributes) + type_dict = self._new_entry_types(entry_name, entry_node.attributes) # sort the attribute dictionary for d in sorted(entry_node.attributes): @@ -1794,7 +1826,7 @@ def _parse_onto_file(self): entry_dict = {} entry_dict[constants.PARENT_CLASS_KEY] = set() entry_dict[constants.PARENT_CLASS_KEY].add(entry_node.parent.name) - entry_dict[constants.TYPE_ATTR_KEY] = attr_dict + entry_dict[constants.ATTR_INFO_KEY] = attr_dict DataStore._type_attributes[entry_name] = entry_dict def _init_top_to_core_entries(self): diff --git a/forte/data/ontology/ontology_code_generator.py b/forte/data/ontology/ontology_code_generator.py index 06b240ae9..ea80ef85c 100644 --- a/forte/data/ontology/ontology_code_generator.py +++ b/forte/data/ontology/ontology_code_generator.py @@ -811,6 +811,10 @@ def parse_schema( # Add entry item to the writer. module_writer.add_entry(en, entry_item) + valid_properties = set( + val[0] for val in self.allowed_types_tree[en.class_name] + ) + # Adding entry attributes to the allowed types for validation. for property in properties: property_name = property[0] @@ -821,9 +825,7 @@ def parse_schema( f"python identifier." ) - if property_name in set( - val[0] for val in self.allowed_types_tree[en.class_name] - ): + if property_name in valid_properties: warnings.warn( f"Attribute type for the entry {en.class_name} " f"and the attribute {property_name} already present in " @@ -1036,6 +1038,10 @@ def parse_entry( property_items, property_names = [], [] for prop_schema in properties: # TODO: add test + + # the prop attributes will store the properties of each attribute + # of the the entry defined by the ontology. The properties are + # the name of the attribute and its data type. 
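For a property entry such as the following (values assumed purely for illustration), the resulting tuple simply pairs the attribute name with its declared type string:

.. code-block:: python

    # Hypothetical property schema as it might appear in a parsed spec.
    prop_schema = {"name": "passage_id", "type": "str"}
    prop = (prop_schema["name"], prop_schema["type"])  # -> ("passage_id", "str")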
prop = (prop_schema["name"], prop_schema["type"]) if prop_schema["name"] in RESERVED_ATTRIBUTE_NAMES: diff --git a/tests/forte/data/data_store_serialization_test.py b/tests/forte/data/data_store_serialization_test.py index 511792ce0..dbe9b7be4 100644 --- a/tests/forte/data/data_store_serialization_test.py +++ b/tests/forte/data/data_store_serialization_test.py @@ -39,10 +39,10 @@ def setUp(self) -> None: DataStore._type_attributes = { "ft.onto.base_ontology.Document": { "attributes": { - "document_class": {"index": 4, "type": (list, (str,))}, - "sentiment": {"index": 5, "type": (dict, (str, float))}, + # "document_class": {"index": 4, "type": (list, (str,))}, + "sentiment": {"index": 4, "type": (dict, (str, float))}, "classifications": { - "index": 6, + "index": 5, "type": (FDict, (str, Classification)), }, }, @@ -86,9 +86,7 @@ def setUp(self) -> None: 1234, "ft.onto.base_ontology.Document", "Positive", - ["Doc class A"], {"Negative": 0}, - {}, ], [ 10, @@ -96,8 +94,6 @@ def setUp(self) -> None: 3456, "ft.onto.base_ontology.Document", "Negative", - ["Doc class B"], - {}, {}, ], [ @@ -106,9 +102,7 @@ def setUp(self) -> None: 4567, "ft.onto.base_ontology.Document", "Positive", - ["Doc class C"], {"Negative": 0}, - {}, ], [ 20, @@ -116,8 +110,6 @@ def setUp(self) -> None: 5678, "ft.onto.base_ontology.Document", "Neutral", - ["Doc class D"], - {}, {}, ], [ @@ -126,9 +118,7 @@ def setUp(self) -> None: 7890, "ft.onto.base_ontology.Document", "Very Positive", - ["Doc class E"], {"Positive": 0}, - {}, ], ], ), @@ -315,19 +305,17 @@ def test_save_attribute_pickle(self): 5, 1234, "ft.onto.base_ontology.Document", + None, "Positive", - ["Doc class A"], {"Negative": 0}, - {}, ], [ 10, 25, 3456, "ft.onto.base_ontology.Document", + None, "Negative", - ["Doc class B"], - {}, {}, ], [ @@ -335,19 +323,17 @@ def test_save_attribute_pickle(self): 20, 4567, "ft.onto.base_ontology.Document", + None, "Positive", - ["Doc class C"], {"Negative": 0}, - {}, ], [ 20, 25, 5678, "ft.onto.base_ontology.Document", + None, "Neutral", - ["Doc class D"], - {}, {}, ], [ @@ -355,10 +341,9 @@ def test_save_attribute_pickle(self): 55, 7890, "ft.onto.base_ontology.Document", + None, "Very Positive", - ["Doc class E"], {"Positive": 0}, - {}, ], ], ), @@ -491,9 +476,7 @@ def test_save_attribute_pickle(self): 1234, "ft.onto.base_ontology.Document", "Positive", - ["Doc class A"], {"Negative": 0}, - {}, ], [ 10, @@ -501,8 +484,6 @@ def test_save_attribute_pickle(self): 3456, "ft.onto.base_ontology.Document", "Negative", - ["Doc class B"], - {}, {}, ], [ @@ -511,9 +492,7 @@ def test_save_attribute_pickle(self): 4567, "ft.onto.base_ontology.Document", "Positive", - ["Doc class C"], {"Negative": 0}, - {}, ], [ 20, @@ -521,8 +500,6 @@ def test_save_attribute_pickle(self): 5678, "ft.onto.base_ontology.Document", "Neutral", - ["Doc class D"], - {}, {}, ], [ @@ -531,9 +508,7 @@ def test_save_attribute_pickle(self): 7890, "ft.onto.base_ontology.Document", "Very Positive", - ["Doc class E"], {"Positive": 0}, - {}, ], ], ), @@ -658,8 +633,8 @@ def test_save_attribute_pickle(self): # test check_attribute with accept_unknown_attribute = False with self.assertRaisesRegex( ValueError, - "Saved ft.onto.base_ontology.Sentence objects have unidentified" - " fields at indices 7, which raise an error.", + "Saved ft.onto.base_ontology.Document objects have unidentified" + " fields at indices 4, which raise an error.", ): DataStore.deserialize( tmpfilepath, @@ -833,19 +808,17 @@ def test_delete_serialize(self): 5, 1234, "ft.onto.base_ontology.Document", 
+ None, "Positive", - ["Doc class A"], {"Negative": 0}, - {}, ], [ 10, 25, 3456, "ft.onto.base_ontology.Document", + None, "Negative", - ["Doc class B"], - {}, {}, ], [ @@ -853,9 +826,8 @@ def test_delete_serialize(self): 25, 5678, "ft.onto.base_ontology.Document", + None, "Neutral", - ["Doc class D"], - {}, {}, ], [ @@ -863,10 +835,9 @@ def test_delete_serialize(self): 55, 7890, "ft.onto.base_ontology.Document", + None, "Very Positive", - ["Doc class E"], {"Positive": 0}, - {}, ], ], ), From 8e3affbaa4ec062b31047da7b55afe6c3acad19c Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Wed, 3 Aug 2022 13:58:11 -0700 Subject: [PATCH 13/20] ontology parsing fix --- forte/data/ontology/ontology_code_generator.py | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/forte/data/ontology/ontology_code_generator.py b/forte/data/ontology/ontology_code_generator.py index ea80ef85c..88a6956af 100644 --- a/forte/data/ontology/ontology_code_generator.py +++ b/forte/data/ontology/ontology_code_generator.py @@ -811,10 +811,6 @@ def parse_schema( # Add entry item to the writer. module_writer.add_entry(en, entry_item) - valid_properties = set( - val[0] for val in self.allowed_types_tree[en.class_name] - ) - # Adding entry attributes to the allowed types for validation. for property in properties: property_name = property[0] @@ -825,7 +821,9 @@ def parse_schema( f"python identifier." ) - if property_name in valid_properties: + if property_name in set( + val[0] for val in self.allowed_types_tree[en.class_name] + ): warnings.warn( f"Attribute type for the entry {en.class_name} " f"and the attribute {property_name} already present in " From da76f925036de5c5c690add2b810603251d3bed5 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Wed, 3 Aug 2022 20:34:31 -0700 Subject: [PATCH 14/20] added comments --- forte/data/data_store.py | 7 +++++++ forte/data/ontology/top.py | 5 +++++ 2 files changed, 12 insertions(+) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index b7f059ae4..5ee8c955d 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -323,6 +323,13 @@ def __setstate__(self, state): info_dict = self._type_attributes[tn][ constants.ATTR_INFO_KEY ][attr] + + # If in case there is an attribute of entry + # referenced by tn which is defined in the + # _type_attributes dict of DataStore but not + # in the serialized data of _type_attributes, + # we dont need to add type information for + # that attribute. except KeyError: continue if constants.ATTR_TYPE_KEY not in info_dict: diff --git a/forte/data/ontology/top.py b/forte/data/ontology/top.py index 3dce620cf..b19224873 100644 --- a/forte/data/ontology/top.py +++ b/forte/data/ontology/top.py @@ -1273,6 +1273,11 @@ def __setstate__(self, state): else: state["_embedding"] = np.empty(0) + # Here we assume that if the payload is not text (in which case + # cache is stored a string), cache will always be stored as a + # numpy array (which is converted to a list during serialization). + # This check can be made more comprehensive when new types of + # payloads are introduced. 
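A simplified, self-contained sketch of the round trip these two hooks perform (an illustration under the same assumption, not the ``Payload`` class itself): array-valued fields are flattened to plain lists before serialization and rebuilt afterwards.

.. code-block:: python

    import numpy as np

    def to_state(embedding, cache):
        # Serialize: ndarray fields become plain lists; strings pass through.
        state = {"_embedding": embedding, "_cache": cache}
        if isinstance(state["_cache"], np.ndarray):
            state["_cache"] = state["_cache"].tolist()
        if isinstance(state["_embedding"], np.ndarray):
            state["_embedding"] = state["_embedding"].tolist()
        return state

    def from_state(state):
        # Deserialize: only list-valued caches were ndarrays before saving.
        state["_embedding"] = np.array(state.get("_embedding", []))
        if isinstance(state.get("_cache"), list):
            state["_cache"] = np.array(state["_cache"])
        return state

    restored = from_state(to_state(np.ones((2, 3)), np.zeros(4)))
    assert restored["_embedding"].shape == (2, 3)
    assert restored["_cache"].shape == (4,)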
if "_cache" in state and isinstance(state["_cache"], list): state["_cache"] = np.array(state["_cache"]) From 8184b9ddb8bd7e017a2491895e0ec75a872122ad Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Tue, 9 Aug 2022 14:36:55 -0700 Subject: [PATCH 15/20] Review changes and python version fix --- forte/data/base_pack.py | 38 +++- forte/data/data_store.py | 174 +++++++++++------- .../data/ontology/code_generation_objects.py | 16 +- .../data/ontology/ontology_code_generator.py | 11 +- .../data/data_store_serialization_test.py | 167 ++++++++--------- 5 files changed, 232 insertions(+), 174 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 0e6a5229f..0029a5606 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -458,12 +458,30 @@ def on_entry_creation( def entry_getter(cls: Entry, attr_name: str): """A getter function for dataclass fields of entry object. - When the field contains ``tid``s, we will convert them to entry - object on the fly. + Depending on the value stored in the data store and the type + of the attribute, the method decides how to process the value. + + - Attributes repersented as ``FList`` and ``FDict`` objects are stored + as list and dictionary respectively in the dtaa store entry. These + values are converted to ``FList`` and ``FDict`` objects on the fly. + - When the field contains ``tid``s, we will convert them to entry + object on the fly. This is done by checking the type + information of the attribute in the entry object. If the + attribute is of type ``Entry`` or a ``ForwardRef``, we can + assume that that value stored in the data store entry represents + the entry's ``tid``. + - When values are stored as a tuple, we assume the value represents + a `subentry` stored in a `MultiPack`. + - In all other cases, the values are returned in the forms that they + are stored in the data store entry. Args: cls: An ``Entry`` class object. attr_name: The name of the attribute. + + Returns: + The value of the required attribute in the form specified + by the corresponding ``Entry`` class object. """ data_store_ref = ( @@ -472,13 +490,13 @@ def entry_getter(cls: Entry, attr_name: str): attr_val = data_store_ref.get_attribute( tid=cls.tid, attr_name=attr_name ) - entry_type = data_store_ref.get_entry_types( + attr_type = data_store_ref.get_attr_type( cls.entry_type(), attr_name ) - if entry_type[0] in (FList, FDict): + if attr_type[0] in (FList, FDict): # Generate FList/FDict object on the fly - return entry_type[0](parent_entry=cls, data=attr_val) + return attr_type[0](parent_entry=cls, data=attr_val) try: # Check dataclass attribute value type # If the attribute was an Entry object, only its tid @@ -493,12 +511,12 @@ def entry_getter(cls: Entry, attr_name: str): # before returning. if ( isinstance(attr_val, int) - and entry_type[1] + and attr_type[1] and any( issubclass(entry, Entry) if isclass(entry) else is_forward_ref(entry) - for entry in list(entry_type[1]) + for entry in list(attr_type[1]) ) ): return cls.pack.get_entry(tid=attr_val) @@ -534,13 +552,13 @@ def entry_setter(cls: Entry, value: Any, attr_name: str): cls.pack._data_store # pylint: disable=protected-access ) - entry_type = data_store_ref.get_entry_types( + attr_type = data_store_ref.get_attr_type( cls.entry_type(), attr_name ) # Assumption: Users will not assign value to a FList/FDict field. # Only internal methods can set the FList/FDict field, and value's # type has to be Iterator[Entry]/Dict[Any, Entry]. 
- if entry_type[0] is FList: + if attr_type[0] is FList: try: attr_value = [entry.tid for entry in value] except AttributeError as e: @@ -548,7 +566,7 @@ def entry_setter(cls: Entry, value: Any, attr_name: str): "You are trying to assign value to a `FList` field, " "which can only accept an iterator of `Entry` objects." ) from e - elif entry_type[0] is FDict: + elif attr_type[0] is FDict: try: attr_value = { key: entry.tid for key, entry in value.items() diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 5ee8c955d..accc4587a 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -19,7 +19,7 @@ import logging from heapq import heappush, heappop from sortedcontainers import SortedList -from typing_inspect import get_origin, get_args +from typing_inspect import get_origin, get_args, is_generic_type from forte.utils import get_class from forte.utils.utils import get_full_module_name @@ -158,19 +158,22 @@ def __init__( """ The ``_type_attributes`` is a private dictionary that provides - ``type_name``, their parent entry, and the order of corresponding attributes. + ``type_name``, their parent entry, and the metadata of corresponding attributes. + This metadata includes the order and type information of attributes. The keys are fully qualified names of every type; The value is a - dictionary with two keys. Key ``attribute`` provides an inner dictionary - with all valid attributes for this type and the information of attributes - among these lists. This information is represented as a dictionary. The - dictionary has two entries, the first is index which determines the position - where an attribute is stored in a data store entry. The second is type, which - is a tuple of two elements that provides the type information of a given - attribute. The first element is the unsubscripted version of - the attribute's type and the second element is the type arguments - for the same. The information represented in this dictionary is the index of - the attribute and the type of the variable it stores. Key ``parent_class`` is - a string representing the ancestors of this type. + dictionary with two keys. + + 1) Key ``attribute`` provides an inner dictionary + with all valid attributes for this type and the information of attributes + among these lists. This information is represented as a dictionary. The + dictionary has two entries: + a) the first is index which determines the position + where an attribute is stored in a data store entry. + b) The second is type, which is a tuple of two elements that provides the + type information of a given attribute. + i) The first element is the unsubscripted version of the attribute's type + ii) the second element is the type arguments for the same. + 2) Key ``parent_class`` is a string representing the ancestors of this type. This structure is supposed to be built dynamically. When a user adds new entries, `DataStore` will check unknown types and add them to @@ -180,33 +183,33 @@ def __init__( .. 
code-block:: python - # DataStore._type_attributes is: - # { - # "ft.onto.base_ontology.Document": { - # "attributes": { - # "document_class": {"index": 4, "type": (list, (str,))}, - # "sentiment": {"index": 5, "type": (dict, (str, float))}, - # "classifications": { - # "index": 6, - # "type":(FDict,(str, Classification)) - # } - # }, - # "parent_class": set(), - # }, - # "ft.onto.base_ontology.Sentence": { - # "attributes": { - # "speaker": {"index": 4, "type": (Union, (str, type(None)))}, - # "part_id": {"index": 5, "type": (Union, (int, type(None)))}, - # "sentiment": {"index": 6, "type": (dict, (str, float))}, - # "classification": {"index": 7, "type": (dict, (str, float))}, - # "classifications": { - # "index": 8, - # "type": (FDict,(str, Classification)) - # }, - # }, - # "parent_class": set(), - # }, - # } + DataStore._type_attributes is: + { + "ft.onto.base_ontology.Document": { + "attributes": { + "document_class": {"index": 4, "type": (list, (str,))}, + "sentiment": {"index": 5, "type": (dict, (str, float))}, + "classifications": { + "index": 6, + "type":(FDict,(str, Classification)) + } + }, + "parent_class": set(), + }, + "ft.onto.base_ontology.Sentence": { + "attributes": { + "speaker": {"index": 4, "type": (Union, (str, type(None)))}, + "part_id": {"index": 5, "type": (Union, (int, type(None)))}, + "sentiment": {"index": 6, "type": (dict, (str, float))}, + "classification": {"index": 7, "type": (dict, (str, float))}, + "classifications": { + "index": 8, + "type": (FDict,(str, Classification)) + }, + }, + "parent_class": set(), + }, + } """ self._init_top_to_core_entries() if self._onto_file_path: @@ -682,24 +685,23 @@ def _new_entry_types( self, type_name: str, attributes: Optional[Set[Tuple[str, str]]] = None ) -> Dict[str, Tuple]: r"""This function takes a fully qualified ``type_name`` class name - and creates a dictionary where the key is attribute of the entry - and value is the type information of that attribute. For example, - - .. code-block:: python - - type_dict = { - "document_class": (list, (str,)), - "sentiment": (dict, (str, float)), - "classifications": (FDict, (str, Classification)) - } - - For each attribute, the type information is represented by a tuple - of two elements. The first element is the unsubscripted version of - the attribute's type and the second element is the type arguments - for the same. The `type_dict` is used to populate the type - information for attributes of an entry specified by ``type_name`` - in `_type_attributes`. - + and a set of tuples representing an attribute and its required type + (only in the case where the ``type_name`` class name represents an + entry being added from a user defined ontology) and creates a + dictionary where the key is attribute of the entry and value is + the type information of that attribute. + + There are two cases in which a fully qualified ``type_name`` class + name can be handled: + + 1) If the class being added is of an existing entry: This means + that there is information present about this entry through + its `dataclass` attributes and their respective types. Thus, + we use the `_get_entry_attributes_by_class` method to fetch + this information. + 2) If the class being added is of a user defined entry: In this + case, we fetch the information about the entry's attributes + and their types from the ``attributes`` argument. Args: type_name: A fully qualified name of an entry class. 
@@ -713,6 +715,21 @@ def _new_entry_types( ('passage_id', 'str'), ('author', 'str') } + Returns: A dictionary representing attributes as key and type + information as value. For each attribute, the type information is + represented by a tuple of two elements. The first element is the + unsubscripted version of the attribute's type and the second + element is the type arguments for the same. The `type_dict` is used + to populate the type information for attributes of an entry + specified by ``type_name`` in `_type_attributes`. For example, + + .. code-block:: python + + type_dict = { + "document_class": (list, (str,)), + "sentiment": (dict, (str, float)), + "classifications": (FDict, (str, Classification)) + } """ type_dict = {} attr_class: Any @@ -735,16 +752,29 @@ def _new_entry_types( else: attr_fields: Dict = self._get_entry_attributes_by_class(type_name) for attr_name, attr_info in attr_fields.items(): - attr_class = get_origin(attr_info.type) - attr_args = get_args(attr_info.type) - - # Prior to Python 3.7, typing.List and typing.Dict - # is not converted to primitive forms of list and + # Prior to Python 3.7, fetching generic type + # aliases resulted in actual type objects whereas from + # Python 3.7, they were converted to their primitive + # form. For example, typing.List and typing.Dict + # is converted to primitive forms of list and # dict. We handle them separately here - if attr_class == Dict: - attr_class = dict - if attr_class == List: - attr_class = list + if is_generic_type(attr_info.type): + try: + # if python version is < 3.7, thr primitive form + # of generic types are stored in the __extra__ + # attribute. This attribute is not present in + # generic types from 3.7. + attr_class = attr_info.type.__extra__ + except AttributeError: + # if python version is < 3.7, thr primitive form + # of generic types are stored in the __origin__ + # attribute. + attr_class = attr_info.type.__origin__ + pass + else: + attr_class = get_origin(attr_info.type) + + attr_args = get_args(attr_info.type) type_dict[attr_name] = tuple([attr_class, attr_args]) @@ -836,11 +866,11 @@ def _is_annotation(self, type_name: str) -> bool: for entry_class in (Annotation, AudioAnnotation) ) - def get_entry_types( + def get_attr_type( self, type_name: str, attr_name: str ) -> Tuple[Any, Tuple]: """ - Retrieve the entry type of a given attribute ``attr_name`` + Retrieve the type information of a given attribute ``attr_name`` in an entry of type ``type_name`` Args: @@ -1809,7 +1839,11 @@ def _parse_onto_file(self): children = entry_tree.root.children while len(children) > 0: # entry_node represents a node in the ontology tree - # generated by parsing an existing ontology file + # generated by parsing an existing ontology file. + # The entry_node the information of the entry + # represented by this node. It also stores the name + # and the type information of the attributes of the + # entry represented by this node. 
entry_node = children.pop(0) children.extend(entry_node.children) diff --git a/forte/data/ontology/code_generation_objects.py b/forte/data/ontology/code_generation_objects.py index c28649f85..f8315ab02 100644 --- a/forte/data/ontology/code_generation_objects.py +++ b/forte/data/ontology/code_generation_objects.py @@ -17,7 +17,7 @@ import warnings from abc import ABC from pathlib import Path -from typing import Optional, Any, List, Dict, Set, Tuple +from typing import Optional, Any, List, Dict, Set, Tuple, cast from numpy import ndarray from forte.data.ontology.code_generation_exceptions import ( @@ -797,7 +797,7 @@ def __init__(self, name: str): self.children: List[EntryTreeNode] = [] self.parent: Optional[EntryTreeNode] = None self.name: str = name - self.attributes: Set[Tuple] = set() + self.attributes: Set[Tuple[str, str]] = set() def __repr__(self): r"""for printing purpose.""" @@ -817,7 +817,7 @@ def add_node( self, curr_entry_name: str, parent_entry_name: str, - curr_entry_attr: Set[Tuple], + curr_entry_attr: Set[Tuple[str, str]], ): r"""Add a tree node with `curr_entry_name` as a child to `parent_entry_name` in the tree, the attributes `curr_entry_attr` @@ -856,7 +856,9 @@ def collect_parents(self, node_dict: Dict[str, Set[str]]): Args: node_dict: the nodes dictionary of nodes to collect parent nodes - for. + for. The entry represented by nodes in this dictionary do not store + type information of its attributes. This dictionary does not store + the type information of the nodes. """ input_node_dict = node_dict.copy() @@ -907,14 +909,16 @@ def fromdict( if parent_entry_name is None: self.root = EntryTreeNode(name=tree_dict["name"]) self.root.attributes = set( - tuple(attr) for attr in tree_dict["attributes"] + cast(Tuple[str, str], tuple(attr)) + for attr in tree_dict["attributes"] ) else: self.add_node( curr_entry_name=tree_dict["name"], parent_entry_name=parent_entry_name, curr_entry_attr=set( - tuple(attr) for attr in tree_dict["attributes"] + cast(Tuple[str, str], tuple(attr)) + for attr in tree_dict["attributes"] ), ) for child in tree_dict["children"]: diff --git a/forte/data/ontology/ontology_code_generator.py b/forte/data/ontology/ontology_code_generator.py index 88a6956af..80d0db784 100644 --- a/forte/data/ontology/ontology_code_generator.py +++ b/forte/data/ontology/ontology_code_generator.py @@ -269,7 +269,7 @@ def __init__( # Adjacency list to store the allowed types (in-built or user-defined), # and their attributes (if any) in order to validate the attribute # types. - self.allowed_types_tree: Dict[str, Set[Tuple]] = {} + self.allowed_types_tree: Dict[str, Set] = {} for type_str in ALL_INBUILT_TYPES: self.allowed_types_tree[type_str] = set() @@ -830,7 +830,7 @@ def parse_schema( f"the ontology, will be overridden", DuplicatedAttributesWarning, ) - self.allowed_types_tree[en.class_name].add(tuple(property)) + self.allowed_types_tree[en.class_name].add(property) # populate the entry tree based on information if merged_entry_tree is not None: curr_entry_name = en.class_name @@ -971,15 +971,16 @@ def construct_init(self, entry_name: EntryName, base_entry: str): def parse_entry( self, entry_name: EntryName, schema: Dict - ) -> Tuple[EntryDefinition, List[Tuple[Any, Any]]]: + ) -> Tuple[EntryDefinition, List[Tuple[str, str]]]: """ Args: entry_name: Object holds various name form of the entry. schema: Dictionary containing specifications for an entry. 
Returns: extracted entry information: entry package string, entry - filename, entry class entry_name, generated entry code and entry - attribute names. + filename, entry class entry_name, generated entry code and a list + of tuples where each element in the list represents the an attribute + in the entry and its corresponding type. """ this_manager = self.import_managers.get(entry_name.module_name) diff --git a/tests/forte/data/data_store_serialization_test.py b/tests/forte/data/data_store_serialization_test.py index dbe9b7be4..5fb33ce72 100644 --- a/tests/forte/data/data_store_serialization_test.py +++ b/tests/forte/data/data_store_serialization_test.py @@ -39,7 +39,6 @@ def setUp(self) -> None: DataStore._type_attributes = { "ft.onto.base_ontology.Document": { "attributes": { - # "document_class": {"index": 4, "type": (list, (str,))}, "sentiment": {"index": 4, "type": (dict, (str, float))}, "classifications": { "index": 5, @@ -86,7 +85,7 @@ def setUp(self) -> None: 1234, "ft.onto.base_ontology.Document", "Positive", - {"Negative": 0}, + None, ], [ 10, @@ -94,7 +93,7 @@ def setUp(self) -> None: 3456, "ft.onto.base_ontology.Document", "Negative", - {}, + "Class B", ], [ 15, @@ -102,7 +101,7 @@ def setUp(self) -> None: 4567, "ft.onto.base_ontology.Document", "Positive", - {"Negative": 0}, + "Class C", ], [ 20, @@ -110,15 +109,15 @@ def setUp(self) -> None: 5678, "ft.onto.base_ontology.Document", "Neutral", - {}, + "Class D", ], [ 40, - 55, + 55, 7890, "ft.onto.base_ontology.Document", "Very Positive", - {"Positive": 0}, + "Class E", ], ], ), @@ -129,23 +128,23 @@ def setUp(self) -> None: 9, 9999, "ft.onto.base_ontology.Sentence", - {}, + "Positive", "teacher", 1, - {"Negative": 0}, - {}, - "abc", + None, + None, + "cba", ], [ 55, 70, 1234567, "ft.onto.base_ontology.Sentence", - {}, - "student", - 2, - {"Positive": 0}, - {}, + "Negative", + None, + None, + "Class C", + "Class D", "abc", ], [ @@ -153,24 +152,24 @@ def setUp(self) -> None: 90, 100, "ft.onto.base_ontology.Sentence", - {}, - "teacher", + "Positive", + "student", 2, - {"Positive": 0}, - {}, - "cba", + "testA", + "class1", + "bad", ], [ 65, 90, 5000, "ft.onto.base_ontology.Sentence", - {}, + "Positive", "TA", - 1, - {"Positive": 0}, - {}, - "bad", + 3, + "testB", + "class2", + "good", ], ], ), @@ -307,7 +306,7 @@ def test_save_attribute_pickle(self): "ft.onto.base_ontology.Document", None, "Positive", - {"Negative": 0}, + None, ], [ 10, @@ -316,7 +315,7 @@ def test_save_attribute_pickle(self): "ft.onto.base_ontology.Document", None, "Negative", - {}, + "Class B", ], [ 15, @@ -325,7 +324,7 @@ def test_save_attribute_pickle(self): "ft.onto.base_ontology.Document", None, "Positive", - {"Negative": 0}, + "Class C", ], [ 20, @@ -334,7 +333,7 @@ def test_save_attribute_pickle(self): "ft.onto.base_ontology.Document", None, "Neutral", - {}, + "Class D", ], [ 40, @@ -343,7 +342,7 @@ def test_save_attribute_pickle(self): "ft.onto.base_ontology.Document", None, "Very Positive", - {"Positive": 0}, + "Class E", ], ], ), @@ -356,31 +355,31 @@ def test_save_attribute_pickle(self): "ft.onto.base_ontology.Sentence", "teacher", 1, - {}, + "Positive", + None, None, - {}, ], [ 55, 70, 1234567, "ft.onto.base_ontology.Sentence", - "student", - 2, - {}, None, - {}, + None, + "Negative", + None, + "Class D", ], [ 60, 90, 100, "ft.onto.base_ontology.Sentence", - "teacher", + "student", 2, - {}, + "Positive", None, - {}, + "class1", ], [ 65, @@ -388,10 +387,10 @@ def test_save_attribute_pickle(self): 5000, "ft.onto.base_ontology.Sentence", "TA", - 1, - {}, 
+ 3, + "Positive", None, - {}, + "class2", ], ], ), @@ -476,7 +475,7 @@ def test_save_attribute_pickle(self): 1234, "ft.onto.base_ontology.Document", "Positive", - {"Negative": 0}, + None, ], [ 10, @@ -484,7 +483,7 @@ def test_save_attribute_pickle(self): 3456, "ft.onto.base_ontology.Document", "Negative", - {}, + "Class B", ], [ 15, @@ -492,7 +491,7 @@ def test_save_attribute_pickle(self): 4567, "ft.onto.base_ontology.Document", "Positive", - {"Negative": 0}, + "Class C", ], [ 20, @@ -500,7 +499,7 @@ def test_save_attribute_pickle(self): 5678, "ft.onto.base_ontology.Document", "Neutral", - {}, + "Class D", ], [ 40, @@ -508,7 +507,7 @@ def test_save_attribute_pickle(self): 7890, "ft.onto.base_ontology.Document", "Very Positive", - {"Positive": 0}, + "Class E", ], ], ), @@ -519,48 +518,49 @@ def test_save_attribute_pickle(self): 9, 9999, "ft.onto.base_ontology.Sentence", - {}, + "Positive", "teacher", 1, - {"Negative": 0}, - {}, - "abc", + None, + None, + "cba", ], [ 55, 70, 1234567, "ft.onto.base_ontology.Sentence", - {}, - "student", - 2, - {"Positive": 0}, - {}, + "Negative", + None, + None, + "Class C", + "Class D", "abc", + ], [ 60, 90, 100, "ft.onto.base_ontology.Sentence", - {}, - "teacher", + "Positive", + "student", 2, - {"Positive": 0}, - {}, - "cba", + "testA", + "class1", + "bad", ], [ 65, 90, 5000, "ft.onto.base_ontology.Sentence", - {}, + "Positive", "TA", - 1, - {"Positive": 0}, - {}, - "bad", + 3, + "testB", + "class2", + "good", ], ], ), @@ -810,7 +810,7 @@ def test_delete_serialize(self): "ft.onto.base_ontology.Document", None, "Positive", - {"Negative": 0}, + None, ], [ 10, @@ -819,7 +819,7 @@ def test_delete_serialize(self): "ft.onto.base_ontology.Document", None, "Negative", - {}, + "Class B", ], [ 20, @@ -828,7 +828,7 @@ def test_delete_serialize(self): "ft.onto.base_ontology.Document", None, "Neutral", - {}, + "Class D", ], [ 40, @@ -837,7 +837,7 @@ def test_delete_serialize(self): "ft.onto.base_ontology.Document", None, "Very Positive", - {"Positive": 0}, + "Class E", ], ], ), @@ -850,31 +850,31 @@ def test_delete_serialize(self): "ft.onto.base_ontology.Sentence", "teacher", 1, - {}, + "Positive", + None, None, - {}, ], [ 55, 70, 1234567, "ft.onto.base_ontology.Sentence", - "student", - 2, - {}, None, - {}, + None, + "Negative", + None, + "Class D", ], [ 60, 90, 100, "ft.onto.base_ontology.Sentence", - "teacher", + "student", 2, - {}, + "Positive", None, - {}, + "class1", ], [ 65, @@ -882,10 +882,11 @@ def test_delete_serialize(self): 5000, "ft.onto.base_ontology.Sentence", "TA", - 1, - {}, + 3, + "Positive", None, - {}, + "class2", + ], ], ), From a48d5d5b5d33fa020e3bf7de09059fecca89c2b2 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Tue, 9 Aug 2022 17:17:23 -0700 Subject: [PATCH 16/20] test commit --- forte/data/data_store.py | 28 +++++++++++++++------------- tests/forte/data/data_store_test.py | 7 +++++++ 2 files changed, 22 insertions(+), 13 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index accc4587a..72e3bd33a 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -13,7 +13,9 @@ from copy import deepcopy import json +import sys from typing import Dict, List, Iterator, Set, Tuple, Optional, Any, Type +from inspect import isclass import uuid import logging @@ -752,29 +754,29 @@ def _new_entry_types( else: attr_fields: Dict = self._get_entry_attributes_by_class(type_name) for attr_name, attr_info in attr_fields.items(): + + attr_class = get_origin(attr_info.type) + attr_args = get_args(attr_info.type) + # Prior 
to Python 3.7, fetching generic type # aliases resulted in actual type objects whereas from # Python 3.7, they were converted to their primitive # form. For example, typing.List and typing.Dict # is converted to primitive forms of list and # dict. We handle them separately here - if is_generic_type(attr_info.type): + if is_generic_type(attr_info.type) and sys.version_info[:3] < ( + 3, + 7, + 0, + ): + # if python version is < 3.7, thr primitive form + # of generic types are stored in the __extra__ + # attribute. This attribute is not present in + # generic types from 3.7. try: - # if python version is < 3.7, thr primitive form - # of generic types are stored in the __extra__ - # attribute. This attribute is not present in - # generic types from 3.7. attr_class = attr_info.type.__extra__ except AttributeError: - # if python version is < 3.7, thr primitive form - # of generic types are stored in the __origin__ - # attribute. - attr_class = attr_info.type.__origin__ pass - else: - attr_class = get_origin(attr_info.type) - - attr_args = get_args(attr_info.type) type_dict[attr_name] = tuple([attr_class, attr_args]) diff --git a/tests/forte/data/data_store_test.py b/tests/forte/data/data_store_test.py index 806be4747..35c9ab3d4 100644 --- a/tests/forte/data/data_store_test.py +++ b/tests/forte/data/data_store_test.py @@ -335,10 +335,17 @@ def test_get_type_info(self): ) empty_data_store._get_type_info("ft.onto.base_ontology.Sentence") self.assertEqual(len(empty_data_store._DataStore__elements), 0) + + print(DataStore._type_attributes["ft.onto.base_ontology.Sentence"]) self.assertEqual( DataStore._type_attributes["ft.onto.base_ontology.Sentence"], self.reference_type_attributes["ft.onto.base_ontology.Sentence"], ) +<<<<<<< Updated upstream + +======= + +>>>>>>> Stashed changes self.assertEqual( DataStore._type_attributes["ft.onto.base_ontology.Document"], self.reference_type_attributes["ft.onto.base_ontology.Document"], From 02e1a37824ae4e304b40b4648e3f0c190e3e2883 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Wed, 10 Aug 2022 12:05:40 -0700 Subject: [PATCH 17/20] entry data structures check --- forte/data/base_pack.py | 3 ++- forte/data/data_store.py | 17 +++++++++++------ forte/data/ontology/core.py | 2 ++ tests/forte/data/data_store_test.py | 8 +------- 4 files changed, 16 insertions(+), 14 deletions(-) diff --git a/forte/data/base_pack.py b/forte/data/base_pack.py index 0029a5606..182231d42 100644 --- a/forte/data/base_pack.py +++ b/forte/data/base_pack.py @@ -48,6 +48,7 @@ LinkType, FList, FDict, + ENTRY_TYPE_DATA_STRUCTURES, ) from forte.version import ( PACK_VERSION, @@ -494,7 +495,7 @@ def entry_getter(cls: Entry, attr_name: str): cls.entry_type(), attr_name ) - if attr_type[0] in (FList, FDict): + if attr_type[0] in ENTRY_TYPE_DATA_STRUCTURES: # Generate FList/FDict object on the fly return attr_type[0](parent_entry=cls, data=attr_val) try: diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 72e3bd33a..0a78b4bde 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -15,7 +15,6 @@ import json import sys from typing import Dict, List, Iterator, Set, Tuple, Optional, Any, Type -from inspect import isclass import uuid import logging @@ -40,7 +39,12 @@ MultiPackGroup, MultiPackLink, ) -from forte.data.ontology.core import Entry, FList, FDict +from forte.data.ontology.core import ( + Entry, + FList, + FDict, + ENTRY_TYPE_DATA_STRUCTURES, +) from forte.common import constants @@ -764,10 +768,11 @@ def _new_entry_types( # form. 
For example, typing.List and typing.Dict # is converted to primitive forms of list and # dict. We handle them separately here - if is_generic_type(attr_info.type) and sys.version_info[:3] < ( - 3, - 7, - 0, + if ( + is_generic_type(attr_info.type) + and hasattr(attr_info.type, "__extra__") + and sys.version_info[:3] < (3, 7, 0) + and attr_class not in ENTRY_TYPE_DATA_STRUCTURES ): # if python version is < 3.7, thr primitive form # of generic types are stored in the __extra__ diff --git a/forte/data/ontology/core.py b/forte/data/ontology/core.py index 63802bdfb..22ecbba96 100644 --- a/forte/data/ontology/core.py +++ b/forte/data/ontology/core.py @@ -864,3 +864,5 @@ def __hash__(self): GroupType = TypeVar("GroupType", bound=BaseGroup) LinkType = TypeVar("LinkType", bound=BaseLink) + +ENTRY_TYPE_DATA_STRUCTURES = (FDict, FList) diff --git a/tests/forte/data/data_store_test.py b/tests/forte/data/data_store_test.py index 35c9ab3d4..87cde3f34 100644 --- a/tests/forte/data/data_store_test.py +++ b/tests/forte/data/data_store_test.py @@ -335,17 +335,11 @@ def test_get_type_info(self): ) empty_data_store._get_type_info("ft.onto.base_ontology.Sentence") self.assertEqual(len(empty_data_store._DataStore__elements), 0) - - print(DataStore._type_attributes["ft.onto.base_ontology.Sentence"]) + self.assertEqual( DataStore._type_attributes["ft.onto.base_ontology.Sentence"], self.reference_type_attributes["ft.onto.base_ontology.Sentence"], ) -<<<<<<< Updated upstream - -======= - ->>>>>>> Stashed changes self.assertEqual( DataStore._type_attributes["ft.onto.base_ontology.Document"], self.reference_type_attributes["ft.onto.base_ontology.Document"], From 75aa6ec316cf78d9c973bd0162c71fd631b8cf78 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Wed, 10 Aug 2022 14:14:26 -0700 Subject: [PATCH 18/20] review changes --- forte/data/data_store.py | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index 0a78b4bde..ebb06a855 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -164,18 +164,18 @@ def __init__( """ The ``_type_attributes`` is a private dictionary that provides - ``type_name``, their parent entry, and the metadata of corresponding attributes. - This metadata includes the order and type information of attributes. - The keys are fully qualified names of every type; The value is a - dictionary with two keys. - - 1) Key ``attribute`` provides an inner dictionary - with all valid attributes for this type and the information of attributes - among these lists. This information is represented as a dictionary. The - dictionary has two entries: - a) the first is index which determines the position - where an attribute is stored in a data store entry. - b) The second is type, which is a tuple of two elements that provides the + ``type_name``, their parent entry, and the metadata of its corresponding + attributes. This metadata includes the order and type information of attributes + stored in the data store entry. The keys are fully qualified names of every entry + type; The value is a dictionary with two keys. + + 1) Key ``attribute`` has its value as a dictionary + with all valid attributes for this entry type as keys and their metadata. + as values. The metadata is represented as another inner dictionary + that has two keys: + a) the first key is ``index`` whose value determines the position + of where the attribute is stored in the data store entry. 
+ b) The second key is type, which is a tuple of two elements that provides the type information of a given attribute. i) The first element is the unsubscripted version of the attribute's type ii) the second element is the type arguments for the same. @@ -199,7 +199,7 @@ def __init__( "index": 6, "type":(FDict,(str, Classification)) } - }, + }, "parent_class": set(), }, "ft.onto.base_ontology.Sentence": { @@ -212,7 +212,7 @@ def __init__( "index": 8, "type": (FDict,(str, Classification)) }, - }, + }, "parent_class": set(), }, } @@ -326,7 +326,7 @@ def __setstate__(self, state): # Update `_type_attributes` to store the types of each # entry attribute as well. for tn in self._type_attributes: - entry_type = self._new_entry_types(tn) + entry_type = self.fetch_entry_type_data(tn) for attr, type_val in entry_type.items(): try: info_dict = self._type_attributes[tn][ @@ -612,7 +612,7 @@ def _get_type_info(self, type_name: str) -> Dict[str, Any]: attr_dict = {} attr_idx = constants.ENTRY_TYPE_INDEX + 1 - type_dict = self._new_entry_types(type_name) + type_dict = self.fetch_entry_type_data(type_name) for attr_name in attributes: attr_dict[attr_name] = { @@ -687,7 +687,7 @@ def _default_attributes_for_type(self, type_name: str) -> List: attr_list[attr_id - constants.ATTR_BEGIN_INDEX] = {} return attr_list - def _new_entry_types( + def fetch_entry_type_data( self, type_name: str, attributes: Optional[Set[Tuple[str, str]]] = None ) -> Dict[str, Tuple]: r"""This function takes a fully qualified ``type_name`` class name @@ -1860,7 +1860,9 @@ def _parse_onto_file(self): attr_dict = {} idx = constants.ATTR_BEGIN_INDEX - type_dict = self._new_entry_types(entry_name, entry_node.attributes) + type_dict = self.fetch_entry_type_data( + entry_name, entry_node.attributes + ) # sort the attribute dictionary for d in sorted(entry_node.attributes): From 39b4de96d625ea3231b35432847bc2f467b27aaa Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Wed, 10 Aug 2022 16:39:01 -0700 Subject: [PATCH 19/20] docstrings --- forte/data/data_store.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index ebb06a855..ce14b2546 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -177,7 +177,7 @@ def __init__( of where the attribute is stored in the data store entry. b) The second key is type, which is a tuple of two elements that provides the type information of a given attribute. - i) The first element is the unsubscripted version of the attribute's type + i) The first element is the `unsubscripted` version of the attribute's type ii) the second element is the type arguments for the same. 2) Key ``parent_class`` is a string representing the ancestors of this type. @@ -724,7 +724,7 @@ def fetch_entry_type_data( Returns: A dictionary representing attributes as key and type information as value. For each attribute, the type information is represented by a tuple of two elements. The first element is the - unsubscripted version of the attribute's type and the second + `unsubscripted` version of the attribute's type and the second element is the type arguments for the same. The `type_dict` is used to populate the type information for attributes of an entry specified by ``type_name`` in `_type_attributes`. 
For example, From 7172ce287cb7a5a20a2308a65e7f019116143472 Mon Sep 17 00:00:00 2001 From: Pushkar-Bhuse Date: Fri, 12 Aug 2022 11:34:55 -0700 Subject: [PATCH 20/20] improve docstring --- forte/data/data_store.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/forte/data/data_store.py b/forte/data/data_store.py index ce14b2546..1f959c774 100644 --- a/forte/data/data_store.py +++ b/forte/data/data_store.py @@ -164,10 +164,10 @@ def __init__( """ The ``_type_attributes`` is a private dictionary that provides - ``type_name``, their parent entry, and the metadata of its corresponding - attributes. This metadata includes the order and type information of attributes - stored in the data store entry. The keys are fully qualified names of every entry - type; The value is a dictionary with two keys. + ``type_name`` as the key, and the metadata of the entry represented by + ``type_name``. This metadata includes the order and type information of + attributes stored in the data store entry; The value is a dictionary with + two keys. 1) Key ``attribute`` has its value as a dictionary with all valid attributes for this entry type as keys and their metadata.
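To round out the metadata described in this docstring, here is a small self-contained sketch (class and field names assumed, not Forte code) of how ``(origin, args)`` pairs of the kind stored under the ``type`` key can be derived from dataclass annotations with ``typing_inspect`` on Python 3.7+:

.. code-block:: python

    from dataclasses import dataclass, fields
    from typing import Dict, List, Optional

    from typing_inspect import get_args, get_origin

    @dataclass
    class ExampleEntry:
        sentiment: Dict[str, float] = None
        document_class: List[str] = None
        speaker: Optional[str] = None

    type_dict = {
        f.name: (get_origin(f.type), get_args(f.type))
        for f in fields(ExampleEntry)
    }
    # On Python 3.7+ this yields roughly:
    # {"sentiment": (dict, (str, float)),
    #  "document_class": (list, (str,)),
    #  "speaker": (typing.Union, (str, NoneType))}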