From bfd5a737e436e71d31fb8893e2a0d1987d58e1b6 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Thu, 25 Apr 2024 18:54:39 +0530 Subject: [PATCH 1/7] Add pipeline to collect pygments symbols Signed-off-by: Keshav Priyadarshi --- .../pipelines/collect_pygments_symbols.py | 46 +++++++++++++++++++ scanpipe/pipes/symbols.py | 34 ++++++++++++++ .../tests/data/source-inspector/test3.cpp | 23 ++++++++++ .../test3.cpp-pygments-expected.json | 25 ++++++++++ scanpipe/tests/pipes/test_symbols.py | 25 ++++++++++ setup.cfg | 3 +- 6 files changed, 155 insertions(+), 1 deletion(-) create mode 100644 scanpipe/pipelines/collect_pygments_symbols.py create mode 100644 scanpipe/tests/data/source-inspector/test3.cpp create mode 100644 scanpipe/tests/data/source-inspector/test3.cpp-pygments-expected.json diff --git a/scanpipe/pipelines/collect_pygments_symbols.py b/scanpipe/pipelines/collect_pygments_symbols.py new file mode 100644 index 000000000..59dec9976 --- /dev/null +++ b/scanpipe/pipelines/collect_pygments_symbols.py @@ -0,0 +1,46 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. +# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. 
No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +from scanpipe.pipelines import Pipeline +from scanpipe.pipes import symbols + + +class CollectPygmentsSymbolsAndStrings(Pipeline): + """ + Collect codebase symbols using pygments and keep them in extra data field. + + Also collect strings and comments. + """ + + download_inputs = False + is_addon = False + + @classmethod + def steps(cls): + return (cls.collect_and_store_pygments_symbols_and_strings,) + + def collect_and_store_pygments_symbols_and_strings(self): + """ + Collect symbols, strings and comments from codebase files using pygments + and store them in the extra data field. + """ + symbols.collect_and_store_pygments_symbols_and_strings(self.project, self.log) diff --git a/scanpipe/pipes/symbols.py b/scanpipe/pipes/symbols.py index 1feb15e63..2ea3c9e81 100644 --- a/scanpipe/pipes/symbols.py +++ b/scanpipe/pipes/symbols.py @@ -21,6 +21,7 @@ # Visit https://github.com/nexB/scancode.io for support and download. from source_inspector import symbols_ctags +from source_inspector import symbols_pygments from scanpipe.pipes import LoopProgress @@ -65,3 +66,36 @@ def _collect_and_store_resource_symbols(resource): symbols = symbols_ctags.collect_symbols(resource.location) tags = [symbol["name"] for symbol in symbols if "name" in symbol] resource.update_extra_data({"source_symbols": tags}) + + +def collect_and_store_pygments_symbols_and_strings(project, logger=None): + """ + Collect symbols, strings and comments from codebase files using pygments and store + them in the extra data field. 
+ """ + project_files = project.codebaseresources.files() + + resources = project_files.filter( + is_binary=False, + is_archive=False, + is_media=False, + ) + + resources_count = resources.count() + + resource_iterator = resources.iterator(chunk_size=2000) + progress = LoopProgress(resources_count, logger) + + for resource in progress.iter(resource_iterator): + _collect_and_store_pygments_symbols_and_strings(resource) + + +def _collect_and_store_pygments_symbols_and_strings(resource): + """ + Collect symbols, strings and comments from a resource using pygments and store + them in the extra data field. + """ + result = symbols_pygments.get_pygments_symbols(resource.location) + resource.update_extra_data({"source_symbols": result.get("source_symbols")}) + resource.update_extra_data({"source_strings": result.get("source_strings")}) + resource.update_extra_data({"source_comments": result.get("source_comments")}) diff --git a/scanpipe/tests/data/source-inspector/test3.cpp b/scanpipe/tests/data/source-inspector/test3.cpp new file mode 100644 index 000000000..5e2d12529 --- /dev/null +++ b/scanpipe/tests/data/source-inspector/test3.cpp @@ -0,0 +1,23 @@ +#include + +// Function to add two integers +int add(int a, int b) { + return a + b; +} + +// Function to subtract two integers +int subtract(int a, int b) { + return a - b; +} + +int main() { + int x = 10; + int y = 5; + + printf("Testing dummy functions:\n"); + + printf("Addition: %d + %d = %d\n", x, y, add(x, y)); + printf("Subtraction: %d - %d = %d\n", x, y, subtract(x, y)); + + return 0; +} diff --git a/scanpipe/tests/data/source-inspector/test3.cpp-pygments-expected.json b/scanpipe/tests/data/source-inspector/test3.cpp-pygments-expected.json new file mode 100644 index 000000000..8c955d616 --- /dev/null +++ b/scanpipe/tests/data/source-inspector/test3.cpp-pygments-expected.json @@ -0,0 +1,25 @@ +{ + "source_strings": [ + "10", + "5", + "\"", + "Testing dummy functions:", + "\\n", + "\"", + "\"", + "Addition: %d + %d = 
%d", + "\\n", + "\"", + "\"", + "Subtraction: %d - %d = %d", + "\\n", + "\"", + "0" + ], + "source_symbols": [ + "add", + "subtract", + "main" + ], + "source_comments": [] +} \ No newline at end of file diff --git a/scanpipe/tests/pipes/test_symbols.py b/scanpipe/tests/pipes/test_symbols.py index 328c100f3..be9660eeb 100644 --- a/scanpipe/tests/pipes/test_symbols.py +++ b/scanpipe/tests/pipes/test_symbols.py @@ -20,6 +20,7 @@ # ScanCode.io is a free software code scanning tool from nexB Inc. and others. # Visit https://github.com/nexB/scancode.io for support and download. +import json import sys from pathlib import Path from unittest import skipIf @@ -54,3 +55,27 @@ def test_scanpipe_pipes_symbols_collect_and_store_resource_symbols(self): result_extra_data_symbols = main_file.extra_data.get("source_symbols") expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"] self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols) + + def test_scanpipe_pipes_collect_and_store_pygments_symbols_and_strings(self): + dir = self.project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data_location / "source-inspector" / "test3.cpp" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(self.project1) + + symbols.collect_and_store_pygments_symbols_and_strings(self.project1) + + main_file = self.project1.codebaseresources.files()[0] + + result_extra_data = main_file.extra_data + + expected_extra_data = ( + self.data_location / "source-inspector" / "test3.cpp-pygments-expected.json" + ) + + with open(expected_extra_data) as f: + expected_extra_data = json.load(f) + + self.assertDictEqual(expected_extra_data, result_extra_data) diff --git a/setup.cfg b/setup.cfg index 059a27b56..e7d3b1211 100644 --- a/setup.cfg +++ b/setup.cfg @@ -81,7 +81,7 @@ install_requires = elf-inspector==0.0.1 go-inspector==0.2.2 python-inspector==0.12.0 - source-inspector==0.3.0 + source-inspector==0.5.0 
aboutcode-toolkit==10.1.0 # Utilities XlsxWriter==3.2.0 @@ -132,6 +132,7 @@ scancodeio_pipelines = analyze_docker_image = scanpipe.pipelines.docker:Docker analyze_root_filesystem_or_vm_image = scanpipe.pipelines.root_filesystem:RootFS analyze_windows_docker_image = scanpipe.pipelines.docker_windows:DockerWindows + collect_pygments_symbols = scanpipe.pipelines.collect_pygments_symbols:CollectPygmentsSymbolsAndStrings collect_source_strings = scanpipe.pipelines.collect_source_strings:CollectSourceStrings collect_symbols = scanpipe.pipelines.collect_symbols:CollectSymbols find_vulnerabilities = scanpipe.pipelines.find_vulnerabilities:FindVulnerabilities From f96888b8b3f10e8d2a337de6e2ad8c6986363681 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Thu, 25 Apr 2024 19:18:33 +0530 Subject: [PATCH 2/7] Set is_addon to true Signed-off-by: Keshav Priyadarshi --- scanpipe/pipelines/collect_pygments_symbols.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scanpipe/pipelines/collect_pygments_symbols.py b/scanpipe/pipelines/collect_pygments_symbols.py index 59dec9976..6e64e8369 100644 --- a/scanpipe/pipelines/collect_pygments_symbols.py +++ b/scanpipe/pipelines/collect_pygments_symbols.py @@ -32,7 +32,7 @@ class CollectPygmentsSymbolsAndStrings(Pipeline): """ download_inputs = False - is_addon = False + is_addon = True @classmethod def steps(cls): From 01d6a2cfa41195afe8dcbe561bebf0cab1f006e5 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Thu, 25 Apr 2024 19:23:46 +0530 Subject: [PATCH 3/7] Add integration test for pygments pipeline Signed-off-by: Keshav Priyadarshi --- scanpipe/tests/test_pipelines.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index 1506154ce..b787676e0 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -1267,3 +1267,33 @@ def 
test_scanpipe_collect_source_strings_pipeline_integration(self): "Enter the desired length of your password:", ] self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings) + + def test_scanpipe_collect_pygments_symbols_pipeline_integration(self): + pipeline_name = "collect_pygments_symbols" + project1 = Project.objects.create(name="Analysis") + + dir = project1.codebase_path / "codefile" + dir.mkdir(parents=True) + + file_location = self.data_location / "source-inspector" / "test3.cpp" + copy_input(file_location, dir) + + pipes.collect_and_create_codebase_resources(project1) + + run = project1.add_pipeline(pipeline_name) + pipeline = run.make_pipeline_instance() + + exitcode, out = pipeline.execute() + self.assertEqual(0, exitcode, msg=out) + + main_file = project1.codebaseresources.files()[0] + result_extra_data = main_file.extra_data + + expected_extra_data = ( + self.data_location / "source-inspector" / "test3.cpp-pygments-expected.json" + ) + + with open(expected_extra_data) as f: + expected_extra_data = json.load(f) + + self.assertDictEqual(expected_extra_data, result_extra_data) From 16d0b0d7b0d272ca89b7e08dcc35081665606446 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Thu, 25 Apr 2024 19:29:09 +0530 Subject: [PATCH 4/7] Fix lints Signed-off-by: Keshav Priyadarshi --- .../pipelines/collect_tree_sitter_symbols.py | 48 +++++++++++++++++++ scanpipe/tests/test_pipelines.py | 2 +- 2 files changed, 49 insertions(+), 1 deletion(-) create mode 100644 scanpipe/pipelines/collect_tree_sitter_symbols.py diff --git a/scanpipe/pipelines/collect_tree_sitter_symbols.py b/scanpipe/pipelines/collect_tree_sitter_symbols.py new file mode 100644 index 000000000..e0bcdc263 --- /dev/null +++ b/scanpipe/pipelines/collect_tree_sitter_symbols.py @@ -0,0 +1,48 @@ +# SPDX-License-Identifier: Apache-2.0 +# +# http://nexb.com and https://github.com/nexB/scancode.io +# The ScanCode.io software is licensed under the Apache License version 2.0. 
+# Data generated with ScanCode.io is provided as-is without warranties. +# ScanCode is a trademark of nexB Inc. +# +# You may not use this software except in compliance with the License. +# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software distributed +# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR +# CONDITIONS OF ANY KIND, either express or implied. See the License for the +# specific language governing permissions and limitations under the License. +# +# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES +# OR CONDITIONS OF ANY KIND, either express or implied. No content created from +# ScanCode.io should be considered or used as legal advice. Consult an Attorney +# for any legal advice. +# +# ScanCode.io is a free software code scanning tool from nexB Inc. and others. +# Visit https://github.com/nexB/scancode.io for support and download. + +from scanpipe.pipelines import Pipeline +from scanpipe.pipes import symbols + + +class CollectTreeSitterSymbolsAndStrings(Pipeline): + """ + Collect codebase symbols using tree-sitter and keep them in extra data field. + + Also collect strings and comments. + """ + + download_inputs = False + is_addon = True + + @classmethod + def steps(cls): + return (cls.collect_and_store_tree_sitter_symbols_and_strings,) + + def collect_and_store_tree_sitter_symbols_and_strings(self): + """ + Collect symbols, strings and comments from codebase files using tree-sitter + and store them in the extra data field. 
+ """ + symbols.collect_and_store_tree_sitter_symbols_and_strings( + self.project, self.log + ) diff --git a/scanpipe/tests/test_pipelines.py b/scanpipe/tests/test_pipelines.py index b787676e0..988be16a9 100644 --- a/scanpipe/tests/test_pipelines.py +++ b/scanpipe/tests/test_pipelines.py @@ -1267,7 +1267,7 @@ def test_scanpipe_collect_source_strings_pipeline_integration(self): "Enter the desired length of your password:", ] self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings) - + def test_scanpipe_collect_pygments_symbols_pipeline_integration(self): pipeline_name = "collect_pygments_symbols" project1 = Project.objects.create(name="Analysis") From 434ad38fe5c84f672bd34d4762d5274131ac792b Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Thu, 25 Apr 2024 19:34:14 +0530 Subject: [PATCH 5/7] Address review Signed-off-by: Keshav Priyadarshi --- scanpipe/pipes/symbols.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/scanpipe/pipes/symbols.py b/scanpipe/pipes/symbols.py index 2ea3c9e81..297085200 100644 --- a/scanpipe/pipes/symbols.py +++ b/scanpipe/pipes/symbols.py @@ -96,6 +96,10 @@ def _collect_and_store_pygments_symbols_and_strings(resource): them in the extra data field. 
""" result = symbols_pygments.get_pygments_symbols(resource.location) - resource.update_extra_data({"source_symbols": result.get("source_symbols")}) - resource.update_extra_data({"source_strings": result.get("source_strings")}) - resource.update_extra_data({"source_comments": result.get("source_comments")}) + resource.update_extra_data( + { + "source_symbols": result.get("source_symbols"), + "source_strings": result.get("source_strings"), + "source_comments": result.get("source_comments"), + } + ) From d7bd806be4d2b40d1075e95edb085a6d96270af5 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Thu, 25 Apr 2024 19:38:58 +0530 Subject: [PATCH 6/7] Add docs for pygments pipeline Signed-off-by: Keshav Priyadarshi --- docs/built-in-pipelines.rst | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/docs/built-in-pipelines.rst b/docs/built-in-pipelines.rst index 422d402f2..ed61e0bf0 100644 --- a/docs/built-in-pipelines.rst +++ b/docs/built-in-pipelines.rst @@ -42,6 +42,14 @@ Analyse Docker Windows Image :members: :member-order: bysource +.. _collect_pygments_symbols: + +Collect Pygments Source Strings (addon) +--------------------------------------- +.. autoclass:: scanpipe.pipelines.collect_pygments_symbols.CollectPygmentsSymbolsAndStrings() + :members: + :member-order: bysource + .. _pipeline_collect_source_strings: Collect Source Strings (addon) From bba82827e37ed2a9ad9f7b6a5893c946d08a1e46 Mon Sep 17 00:00:00 2001 From: Keshav Priyadarshi Date: Thu, 25 Apr 2024 20:33:35 +0530 Subject: [PATCH 7/7] Add CHANGELOG Signed-off-by: Keshav Priyadarshi --- CHANGELOG.rst | 4 ++++ docs/built-in-pipelines.rst | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 0f8e4c459..fabe28e91 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -11,6 +11,10 @@ v34.5.0 (unreleased) datafile_resource fields do not have a value. 
https://github.com/nexB/scancode.io/issues/1177 +- Add a new `CollectPygmentsSymbolsAndStrings` pipeline (addon) for collecting source + symbols, strings and comments using Pygments. + https://github.com/nexB/scancode.io/pull/1179 + v34.4.0 (2024-04-22) -------------------- diff --git a/docs/built-in-pipelines.rst b/docs/built-in-pipelines.rst index ed61e0bf0..6d413c7b6 100644 --- a/docs/built-in-pipelines.rst +++ b/docs/built-in-pipelines.rst @@ -44,7 +44,7 @@ Analyse Docker Windows Image .. _collect_pygments_symbols: -Collect Pygments Source Strings (addon) +Collect Pygments Source Symbols (addon) --------------------------------------- .. autoclass:: scanpipe.pipelines.collect_pygments_symbols.CollectPygmentsSymbolsAndStrings() :members: