Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use "describe table" to get the columns in a relation on snowflake (#2260) #2324

Merged
merged 2 commits into from
Apr 16, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
- Added support for `db_groups` and `autocreate` flags in Redshift configurations. ([#1995](https://github.com/fishtown-analytics/dbt/issues/1995), [#2262](https://github.com/fishtown-analytics/dbt/pull/2262))
- Users can supply paths as arguments to `--models` and `--select`, either explicitly by prefixing with `path:` or implicitly with no prefix. ([#454](https://github.com/fishtown-analytics/dbt/issues/454), [#2258](https://github.com/fishtown-analytics/dbt/pull/2258))
- dbt now builds the relation cache for "dbt compile" and "dbt ls" as well as "dbt run" ([#1705](https://github.com/fishtown-analytics/dbt/issues/1705), [#2319](https://github.com/fishtown-analytics/dbt/pull/2319))
- Snowflake now uses "describe table" to get the columns in a relation ([#2260](https://github.com/fishtown-analytics/dbt/issues/2260), [#2324](https://github.com/fishtown-analytics/dbt/pull/2324))

### Fixes
- When a jinja value is undefined, give a helpful error instead of failing with cryptic "cannot pickle ParserMacroCapture" errors ([#2110](https:/fishtown-analytics/dbt/issues/2110), [#2184](https:/fishtown-analytics/dbt/pull/2184))
Expand Down
47 changes: 46 additions & 1 deletion core/dbt/adapters/base/column.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from dataclasses import dataclass
import re

from hologram import JsonSchemaMixin
from dbt.exceptions import RuntimeException

from typing import Dict, ClassVar, Any, Optional

Expand Down Expand Up @@ -74,7 +76,7 @@ def is_numeric(self) -> bool:

def string_size(self) -> int:
if not self.is_string():
raise RuntimeError("Called string_size() on non-string field!")
raise RuntimeException("Called string_size() on non-string field!")

if self.dtype == 'text' or self.char_size is None:
# char_size should never be None. Handle it reasonably just in case
Expand Down Expand Up @@ -108,3 +110,46 @@ def numeric_type(cls, dtype: str, precision: Any, scale: Any) -> str:

def __repr__(self) -> str:
return "<Column {} ({})>".format(self.name, self.data_type)

@classmethod
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

While we're in here, can we override the string_size on the SnowflakeColumn class? I think right now, the base logic looks like:

    def string_size(self) -> int:
        if not self.is_string():
            raise RuntimeException("Called string_size() on non-string field!")

        if self.dtype == 'text' or self.char_size is None:
            # char_size should never be None. Handle it reasonably just in case
            return 256
        else:
            return int(self.char_size)

But on Snowflake, we want to map TEXT, STRING, or VARCHAR (with no size specified) to VARCHAR(MAX), which is 16777216.

def from_description(cls, name: str, raw_data_type: str) -> 'Column':
    """Build a Column from a ``describe table``-style type string.

    :param name: the column name.
    :param raw_data_type: a bare type name optionally followed by a
        parenthesized size spec, e.g. ``TEXT``, ``VARCHAR(256)``,
        or ``NUMBER(10,2)``.
    :raises RuntimeException: if the type string cannot be parsed, or a
        size component is not an integer.
    """
    # Split "TYPE(sizes)" into the type name and the optional "(sizes)".
    match = re.match(r'([^(]+)(\([^)]+\))?', raw_data_type)
    if match is None:
        raise RuntimeException(
            f'Could not interpret data type "{raw_data_type}"'
        )
    data_type, size_info = match.groups()

    def parse_size(raw: str) -> int:
        # Every size component must be an integer; wrap conversion
        # failures in a RuntimeException with the full type string.
        try:
            return int(raw)
        except ValueError:
            raise RuntimeException(
                f'Could not interpret data_type "{raw_data_type}": '
                f'could not convert "{raw}" to an integer'
            )

    char_size = None
    numeric_precision = None
    numeric_scale = None
    if size_info is not None:
        # Strip the surrounding parentheses. A single component is a
        # character size; two components are numeric precision/scale.
        parts = size_info[1:-1].split(',')
        if len(parts) == 1:
            char_size = parse_size(parts[0])
        elif len(parts) == 2:
            numeric_precision = parse_size(parts[0])
            numeric_scale = parse_size(parts[1])

    return cls(
        name, data_type, char_size, numeric_precision, numeric_scale
    )
10 changes: 10 additions & 0 deletions plugins/snowflake/dbt/adapters/snowflake/column.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from dataclasses import dataclass

from dbt.adapters.base.column import Column
from dbt.exceptions import RuntimeException


@dataclass
Expand All @@ -19,3 +20,12 @@ def is_float(self):
return self.dtype.lower() in [
'float', 'float4', 'float8', 'double', 'double precision', 'real',
]

def string_size(self) -> int:
    """Return the character capacity of this string column.

    Snowflake treats TEXT and unsized VARCHAR as the platform maximum,
    VARCHAR(16777216), so that is the fallback when no explicit size is
    available.

    :raises RuntimeException: if called on a non-string column.
    """
    if not self.is_string():
        raise RuntimeException("Called string_size() on non-string field!")

    has_explicit_size = self.dtype != 'text' and self.char_size is not None
    return int(self.char_size) if has_explicit_size else 16777216
40 changes: 17 additions & 23 deletions plugins/snowflake/dbt/include/snowflake/macros/adapters.sql
Original file line number Diff line number Diff line change
Expand Up @@ -52,31 +52,25 @@
{% endmacro %}

{% macro snowflake__get_columns_in_relation(relation) -%}
{% call statement('get_columns_in_relation', fetch_result=True) %}
select
column_name,
data_type,
character_maximum_length,
numeric_precision,
numeric_scale

from
{{ relation.information_schema('columns') }}

where table_name ilike '{{ relation.identifier }}'
{% if relation.schema %}
and table_schema ilike '{{ relation.schema }}'
{% endif %}
{% if relation.database %}
and table_catalog ilike '{{ relation.database }}'
{% endif %}
order by ordinal_position
{%- set sql -%}
describe table {{ relation }}
{%- endset -%}
{%- set result = run_query(sql) -%}

{% endcall %}

{% set table = load_result('get_columns_in_relation').table %}
{{ return(sql_convert_columns_in_relation(table)) }}
{% set maximum = 10000 %}
{% if (result | length) >= maximum %}
{% set msg %}
Too many columns in relation {{ relation }}! dbt can only get
information about relations with fewer than {{ maximum }} columns.
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

i hope nobody ever sees this error message :p

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually don't know for sure that this is a limit, I just assumed it based on all the other commands. I'm comfortable saying that if someone has 10k columns in a table, we can revisit this then :)

{% endset %}
{% do exceptions.raise_compiler_error(msg) %}
{% endif %}

{% set columns = [] %}
{% for row in result %}
{% do columns.append(api.Column.from_description(row['name'], row['type'])) %}
{% endfor %}
{% do return(columns) %}
{% endmacro %}

{% macro snowflake__list_schemas(database) -%}
Expand Down
70 changes: 70 additions & 0 deletions test/unit/test_snowflake_adapter.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import dbt.flags as flags

from dbt.adapters.snowflake import SnowflakeAdapter
from dbt.adapters.snowflake.column import SnowflakeColumn
from dbt.adapters.base.query_headers import MacroQueryStringSetter
from dbt.clients import agate_helper
from dbt.logger import GLOBAL_LOGGER as logger # noqa
Expand Down Expand Up @@ -449,3 +450,72 @@ def test_convert_time_type(self):
expected = ['time', 'time', 'time']
for col_idx, expect in enumerate(expected):
assert SnowflakeAdapter.convert_time_type(agate_table, col_idx) == expect


class TestSnowflakeColumn(unittest.TestCase):
    """Unit tests for SnowflakeColumn.from_description parsing."""

    def _assert_parsed(self, col, dtype, char_size=None,
                       numeric_precision=None, numeric_scale=None):
        # Shared structural checks for a column parsed from 'my_col'.
        assert col.column == 'my_col'
        assert col.dtype == dtype
        assert col.char_size == char_size
        assert col.numeric_precision == numeric_precision
        assert col.numeric_scale == numeric_scale

    def _assert_flags(self, col, is_float=False, is_number=False,
                      is_numeric=False, is_string=False, is_integer=False):
        # Every type predicate defaults to False; pass the exceptions.
        assert col.is_float() is is_float
        assert col.is_number() is is_number
        assert col.is_numeric() is is_numeric
        assert col.is_string() is is_string
        assert col.is_integer() is is_integer

    def test_text_from_description(self):
        # TEXT and unsized VARCHAR both map to the Snowflake maximum.
        for raw_type in ('TEXT', 'VARCHAR'):
            col = SnowflakeColumn.from_description('my_col', raw_type)
            self._assert_parsed(col, raw_type)
            self._assert_flags(col, is_string=True)
            assert col.string_size() == 16777216

    def test_sized_varchar_from_description(self):
        col = SnowflakeColumn.from_description('my_col', 'VARCHAR(256)')
        self._assert_parsed(col, 'VARCHAR', char_size=256)
        self._assert_flags(col, is_string=True)
        assert col.string_size() == 256

    def test_sized_decimal_from_description(self):
        col = SnowflakeColumn.from_description('my_col', 'DECIMAL(1, 0)')
        self._assert_parsed(col, 'DECIMAL',
                            numeric_precision=1, numeric_scale=0)
        self._assert_flags(col, is_number=True, is_numeric=True)

    def test_float_from_description(self):
        col = SnowflakeColumn.from_description('my_col', 'FLOAT8')
        self._assert_parsed(col, 'FLOAT8')
        self._assert_flags(col, is_float=True, is_number=True)