Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Addon pipeline to collect pygments symbols #1179

Merged
merged 7 commits into from
Apr 26, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ v34.5.0 (unreleased)
datafile_resource fields do not have a value.
https://github.com/nexB/scancode.io/issues/1177

- Add a new `CollectPygmentsSymbolsAndStrings` pipeline (addon) for collecting source
symbols, strings and comments using Pygments.
https://github.com/nexB/scancode.io/pull/1179

v34.4.0 (2024-04-22)
--------------------

Expand Down
8 changes: 8 additions & 0 deletions docs/built-in-pipelines.rst
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,14 @@ Analyse Docker Windows Image
:members:
:member-order: bysource

.. _collect_pygments_symbols:

Collect Pygments Source Symbols (addon)
---------------------------------------
.. autoclass:: scanpipe.pipelines.collect_pygments_symbols.CollectPygmentsSymbolsAndStrings()
:members:
:member-order: bysource

.. _pipeline_collect_source_strings:

Collect Source Strings (addon)
Expand Down
46 changes: 46 additions & 0 deletions scanpipe/pipelines/collect_pygments_symbols.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import symbols


class CollectPygmentsSymbolsAndStrings(Pipeline):
    """
    Collect codebase symbols with Pygments and keep them in the extra data field.

    Strings and comments are collected as well.
    """

    download_inputs = False
    is_addon = True

    @classmethod
    def steps(cls):
        # This pipeline is made of a single step.
        pipeline_steps = (cls.collect_and_store_pygments_symbols_and_strings,)
        return pipeline_steps

    def collect_and_store_pygments_symbols_and_strings(self):
        """
        Use Pygments to collect the symbols, strings and comments of the codebase
        files, then store them in the extra data field.
        """
        project, log = self.project, self.log
        symbols.collect_and_store_pygments_symbols_and_strings(project, log)
48 changes: 48 additions & 0 deletions scanpipe/pipelines/collect_tree_sitter_symbols.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
# SPDX-License-Identifier: Apache-2.0
#
# http://nexb.com and https://github.com/nexB/scancode.io
# The ScanCode.io software is licensed under the Apache License version 2.0.
# Data generated with ScanCode.io is provided as-is without warranties.
# ScanCode is a trademark of nexB Inc.
#
# You may not use this software except in compliance with the License.
# You may obtain a copy of the License at: http://apache.org/licenses/LICENSE-2.0
# Unless required by applicable law or agreed to in writing, software distributed
# under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR
# CONDITIONS OF ANY KIND, either express or implied. See the License for the
# specific language governing permissions and limitations under the License.
#
# Data Generated with ScanCode.io is provided on an "AS IS" BASIS, WITHOUT WARRANTIES
# OR CONDITIONS OF ANY KIND, either express or implied. No content created from
# ScanCode.io should be considered or used as legal advice. Consult an Attorney
# for any legal advice.
#
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

from scanpipe.pipelines import Pipeline
from scanpipe.pipes import symbols


class CollectTreeSitterSymbolsAndStrings(Pipeline):
    """
    Collect codebase symbols with tree-sitter and keep them in the extra data field.

    Strings and comments are collected as well.
    """

    download_inputs = False
    is_addon = True

    @classmethod
    def steps(cls):
        # This pipeline is made of a single step.
        pipeline_steps = (cls.collect_and_store_tree_sitter_symbols_and_strings,)
        return pipeline_steps

    def collect_and_store_tree_sitter_symbols_and_strings(self):
        """
        Use tree-sitter to collect the symbols, strings and comments of the
        codebase files, then store them in the extra data field.
        """
        project, log = self.project, self.log
        symbols.collect_and_store_tree_sitter_symbols_and_strings(project, log)
38 changes: 38 additions & 0 deletions scanpipe/pipes/symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
# Visit https://github.com/nexB/scancode.io for support and download.

from source_inspector import symbols_ctags
from source_inspector import symbols_pygments

from scanpipe.pipes import LoopProgress

Expand Down Expand Up @@ -65,3 +66,40 @@ def _collect_and_store_resource_symbols(resource):
symbols = symbols_ctags.collect_symbols(resource.location)
tags = [symbol["name"] for symbol in symbols if "name" in symbol]
resource.update_extra_data({"source_symbols": tags})


def collect_and_store_pygments_symbols_and_strings(project, logger=None):
    """
    Collect symbols, strings and comments with Pygments from the ``project``
    codebase files and store them in the extra data field of each resource.

    Only file resources that are not flagged as binary, archive or media are
    processed. The ``logger`` is handed to ``LoopProgress``, which wraps the
    iteration over the resources.
    """
    text_resources = project.codebaseresources.files().filter(
        is_binary=False,
        is_archive=False,
        is_media=False,
    )

    progress = LoopProgress(text_resources.count(), logger)
    # Stream the resources in chunks rather than loading the whole QuerySet.
    for resource in progress.iter(text_resources.iterator(chunk_size=2000)):
        _collect_and_store_pygments_symbols_and_strings(resource)


def _collect_and_store_pygments_symbols_and_strings(resource):
    """
    Run the Pygments collector on the ``resource`` location and store the
    symbols, strings and comments it returns in the resource extra data.
    """
    collected = symbols_pygments.get_pygments_symbols(resource.location)
    # A key missing from the collector output is stored as None.
    extra_data_keys = ("source_symbols", "source_strings", "source_comments")
    resource.update_extra_data({key: collected.get(key) for key in extra_data_keys})
23 changes: 23 additions & 0 deletions scanpipe/tests/data/source-inspector/test3.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
#include <stdio.h>

// Function to add two integers
int add(int a, int b) {
return a + b;
}

// Function to subtract two integers
int subtract(int a, int b) {
return a - b;
}

int main() {
int x = 10;
int y = 5;

printf("Testing dummy functions:\n");

printf("Addition: %d + %d = %d\n", x, y, add(x, y));
printf("Subtraction: %d - %d = %d\n", x, y, subtract(x, y));

return 0;
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"source_strings": [
"10",
"5",
"\"",
"Testing dummy functions:",
"\\n",
"\"",
"\"",
"Addition: %d + %d = %d",
"\\n",
"\"",
"\"",
"Subtraction: %d - %d = %d",
"\\n",
"\"",
"0"
],
"source_symbols": [
"add",
"subtract",
"main"
],
"source_comments": []
}
25 changes: 25 additions & 0 deletions scanpipe/tests/pipes/test_symbols.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
# ScanCode.io is a free software code scanning tool from nexB Inc. and others.
# Visit https://github.com/nexB/scancode.io for support and download.

import json
import sys
from pathlib import Path
from unittest import skipIf
Expand Down Expand Up @@ -54,3 +55,27 @@ def test_scanpipe_pipes_symbols_collect_and_store_resource_symbols(self):
result_extra_data_symbols = main_file.extra_data.get("source_symbols")
expected_extra_data_symbols = ["generatePassword", "passwordLength", "charSet"]
self.assertCountEqual(expected_extra_data_symbols, result_extra_data_symbols)

def test_scanpipe_pipes_collect_and_store_pygments_symbols_and_strings(self):
    # Put the C++ fixture in a fresh "codefile" directory of the project codebase.
    codefile_dir = self.project1.codebase_path / "codefile"
    codefile_dir.mkdir(parents=True)

    fixtures_dir = self.data_location / "source-inspector"
    copy_input(fixtures_dir / "test3.cpp", codefile_dir)

    pipes.collect_and_create_codebase_resources(self.project1)
    symbols.collect_and_store_pygments_symbols_and_strings(self.project1)

    # The stored extra data must match the expected JSON fixture exactly.
    expected_location = fixtures_dir / "test3.cpp-pygments-expected.json"
    with open(expected_location) as expected_file:
        expected_extra_data = json.load(expected_file)

    main_file = self.project1.codebaseresources.files()[0]
    self.assertDictEqual(expected_extra_data, main_file.extra_data)
30 changes: 30 additions & 0 deletions scanpipe/tests/test_pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -1267,3 +1267,33 @@ def test_scanpipe_collect_source_strings_pipeline_integration(self):
"Enter the desired length of your password:",
]
self.assertCountEqual(expected_extra_data_strings, result_extra_data_strings)

def test_scanpipe_collect_pygments_symbols_pipeline_integration(self):
    project1 = Project.objects.create(name="Analysis")

    # Put the C++ fixture in a fresh "codefile" directory of the project codebase.
    codefile_dir = project1.codebase_path / "codefile"
    codefile_dir.mkdir(parents=True)

    fixtures_dir = self.data_location / "source-inspector"
    copy_input(fixtures_dir / "test3.cpp", codefile_dir)

    pipes.collect_and_create_codebase_resources(project1)

    # Run the add-on pipeline end to end; it must exit successfully.
    run = project1.add_pipeline("collect_pygments_symbols")
    exitcode, out = run.make_pipeline_instance().execute()
    self.assertEqual(0, exitcode, msg=out)

    # The stored extra data must match the expected JSON fixture exactly.
    expected_location = fixtures_dir / "test3.cpp-pygments-expected.json"
    with open(expected_location) as expected_file:
        expected_extra_data = json.load(expected_file)

    main_file = project1.codebaseresources.files()[0]
    self.assertDictEqual(expected_extra_data, main_file.extra_data)
3 changes: 2 additions & 1 deletion setup.cfg
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ install_requires =
elf-inspector==0.0.1
go-inspector==0.2.2
python-inspector==0.12.0
source-inspector==0.3.0
source-inspector==0.5.0
aboutcode-toolkit==10.1.0
# Utilities
XlsxWriter==3.2.0
Expand Down Expand Up @@ -132,6 +132,7 @@ scancodeio_pipelines =
analyze_docker_image = scanpipe.pipelines.docker:Docker
analyze_root_filesystem_or_vm_image = scanpipe.pipelines.root_filesystem:RootFS
analyze_windows_docker_image = scanpipe.pipelines.docker_windows:DockerWindows
collect_pygments_symbols = scanpipe.pipelines.collect_pygments_symbols:CollectPygmentsSymbolsAndStrings
collect_source_strings = scanpipe.pipelines.collect_source_strings:CollectSourceStrings
collect_symbols = scanpipe.pipelines.collect_symbols:CollectSymbols
find_vulnerabilities = scanpipe.pipelines.find_vulnerabilities:FindVulnerabilities
Expand Down