Datacon 2024 supply chain write up for QAQ

尝试1

首先阅读了复旦大学今年新发表的论文，其方法主要分为三步：特征提取、顺序建模、BERT 微调。由于论文没有公布源码，也没有提供黑白样本数据集，因此我自行实现了基于 PyPI 包的特征提取器与顺序建模器。但后续综合考虑了以下三个因素，最终放弃了这个尝试。

因素1：论文效果不稳定，论文只指出能发现新的恶意包，并没有在大规模数据集上进行尝试，召回率与准确率无法保证。
因素2：特征提取与顺序建模器跑得太慢，优化后仍不理想。
因素3：公开的 PyPI 恶意包数据集很少，目前仅获得了 Backstabber 数据集。

方法流程如下：

具体技术细节如下：为了提取特征，使用 Tree-sitter 将 Python 和 JavaScript 代码转换为 AST 表示，并通过使用 Tree-sitter 提供的 AST 查询匹配语法结构来识别可疑行为。为了确定方法的优先级，使用 tree-sitter 来解析每个源代码。为了生成调用图，我们利用 PyCG（用于 Python）和 Jelly（用于 JavaScript）。在微调过程中，我们采用 Adam 优化器，学习率为 1e-6，批量大小为 1。模型经过 3 个 epoch 的训练。

我具体实现的源码如下（PYPI）：

# pypi_extractor_and_generator
import os
import re
import glob
import zipfile
import tarfile
from basic_utils import read_file, traverse_py_files
from tree_sitter import Language, Parser
from tqdm import tqdm

import warnings
warnings.filterwarnings("ignore", category=FutureWarning)

class PypiPipline:
    def __init__(self):

        self.unzip_control = True
        self.debug_mode = False
        self.parser = None
        self.PY_LANGUAGE = None

        self.packet_type = ["*.whl", "*.zip", "*.tar.gz", "*.tgz"]

        self.single_pyfile_import_funcs = set()
        self.single_pyfile_import_modules_complete = set()
        self.single_pyfile_import_modules_map = dict()
        self.single_pyfile_def_funcs = dict()

        self.single_pyfile_parse_result = set()

        # For feature extractor
        self.packet_parse_result = set()

        # For sequence generator
        self.packet_all_import_modules = set()
        self.packet_func_call_edge = dict()
        self.packet_all_funcs = set()

        self.install_running_funcs = set()
        self.import_running_funcs = set()
        self.running_exec_funcs = set()
        self.other_funcs = set()

        # Information Reading --------------------------------------------------------------------------------------------------------------
        # System module
        self.R1 = set(["os", "os.path", "sys", "platform", "getpass", "ctypes", "glob"])

        # System module call
        self.R2 = set(["os.path.join", "os.path.exists", "os.path.isabs", "ctypes.windll", "ctypes.cdll", "glob.glob", "glob.iglob"])

        # File system module
        self.R3 = set(["shutil", "pathlib", "tempfile", "io"])

        # File module call
        self.R4 = set(["os.mkdir", "os.makedirs", "os.remove", "os.rmdir", "os.removedirs", "os.rename", "os.listdir", "os.stat", \
                "os.path.isfile", "os.path.isdir", "os.path.basename", "os.path.dirname", "os.chdir", \
                "shutil.copy", "shutil.copytree", "shutil.move", "shutil.rmtree", "shutil.make_archive", "shutil.unpack_archive", \
                "pathlib.Path", "pathlib.Path.exists", "pathlib.Path.is_file", "pathlib.Path.is_dir", "pathlib.Path.mkdir", "pathlib.Path.rename", \
                "tempfile.TemporaryFile", "tempfile.NamedTemporaryFile", "tempfile.mkdtemp", \
                "os.chmod", "os.chown", \
                "io.open", "io.BytesIO", "io.StringIO" ])

        # Read sensitive information call
        self.R5 = set(["os.getpid", "os.getppid", "os.getenv", "os.putenv", "os.environ", "os.getlogin", "os.name", "os.getcwd", \
                "getpid", "getppid", "getenv", "putenv", "environ", "getlogin", "getcwd", \
                "sys.platform", "sys.version", \
                "platform", "version", \
                "socket.getfqdn", "getfqdn",
                "platform.system", "platform.version", "platform.release", "platform.machine", "platform.processor", "platform.node", \
                "psutil.cpu_times", "psutil.virtual_memory", "psutil.disk_usage", "psutil.net_if_addrs", "psutil.process_iter", "psutil.Process", \
                "win32api.GetUserName", "win32api.GetComputerName", "socket.gethostname", "socket.gethostbyname"])

        # Data Transmission ----------------------------------------------------------------------------------------------------------------
        # Network module
        self.D1 = set(["socket", "socketserver", "select", "requests", "http.client", "urllib", "paramiko", "smtplib", "asyncio"])

        # Use network module call
        self.D2 = set(["socket.socket", "socket.connect", "socket.bind", "socket.listen", "socket.accept", "socket.send", "socket.recv", "socket.close", \
                "socketserver.TCPServer", "socketserver.UDPServer", \
                "select.select", "select.poll", "select.epoll", \
                "requests.get", "requests.post", "requests.request", "requests.put", "requests.Request", "request.urlopen",\
                "http.client.HTTPConnection", \
                "urllib.request.urlopen", "urllib.request.Request", \
                "paramiko.SSHClient", \
                "smtplib.SMTP", "smtp.sendmail", \
                "asyncio.run", "asyncio.create_task"])
        
        # Sensitive string (ip addr, URL)
        self.D3 = set()

        # Encoding -------------------------------------------------------------------------------------------------------------------------
        # Encode module
        self.E1 = set(["codecs", "chardet", "base64", "hashlib", "unidecode"])

        # Encoding module call
        self.E2 = set(["codecs.encode", "codecs.open", "codecs.decode", "chardet.detect", "hashlib.md5", "hashlib.sha256"])

        # Base64 encoding call
        self.E3 = set(["base64.b64encode", "base64.b64decode", "base64.urlsafe_b64encode", "base64.urlsafe_b64decode"])

        # Long string (base64 table)
        self.E4 = set()

        # Payload execution ----------------------------------------------------------------------------------------------------------------
        # Process module
        self.P1 = set(["subprocess", "multiprocessing", "signal", "PyExecJS"])

        # Use process module call
        self.P2 = set(["os.spawn", "multiprocessing.Process", "multiprocessing.Queue", "multiprocessing.current_process", \
                "os.fork", "signal.signal", "threading.Thread"])

        # Use bash script
        self.P3 = set(["whoami", "chmod", "useradd", "rm", "curl", "/etc/shadow", "/passwd", "iptables", "nohup", "del", "systeminfo", "sc stop", "netstat", "psexec", "HKEY_LOCAL_MACHINE", \
                "net user", "net localgroup", "input/event0", "ifconfig", "uname", "hostname", "lsb_release", "lscpu", "cpuinfo", "fdisk", "ps aux", "systemctl", \
                "sudo", "auth.log", "wmic", "ipconfig", "tasklist", "eventvwr", "reg query"])

        # Execute code
        self.P4 = set(["os.system", "os.popen", "os.exec", "pyarmor", \
                "system", "popen", "exec", \
                "subprocess.run", "subprocess.Popen", "subprocess.call", "subprocess.check_output", "subprocess.Popen.communicate", \
                "eval", "execjs.compile"])
        
        self.action_mapping = {
            "M1": "suspicious package name", 
            "M2": "suspicious maintainer",
            "M3": "malicious dependencies",
            "M4": "abnormal publish time",
            "M5": "contain package install script",
            "M6": "contain executable file",

            "R1": "import operating system module",
            "R2": "use operating system module call",
            "R3": "import file system module",
            "R4": "use file system module call",
            "R5": "read sensitive information",

            "D1": "import network module",
            "D2": "use network module call",
            "D3": "use URL",

            "E1": "import encoding module",
            "E2": "use encoding module call",
            "E3": "use base64 module call",
            "E4": "use long string",

            "P1": "import process module",
            "P2": "use process module call",
            "P3": "use bash script",
            "P4": "evaluate code at run-time",
        }

    def package_unzip(self, pkg_path: str) -> str:

        if os.path.isfile(pkg_path) == 0:
            print("[-] package_unzip error")
            return ""

        if pkg_path[-4:] == ".whl" or pkg_path[-4:] == ".zip":
            pkg_path_without_suffix, _ = os.path.splitext(pkg_path)
            print(pkg_path, pkg_path_without_suffix)
            with zipfile.ZipFile(pkg_path, 'r') as zip_ref:
                if self.unzip_control:
                    zip_ref.extractall(pkg_path_without_suffix)
        elif pkg_path[-4:] == ".tgz" or pkg_path[-7:] == ".tar.gz":
            if pkg_path[-4:] == ".tgz": 
                pkg_path_without_suffix = pkg_path[:-4]
            elif pkg_path[-7:] == ".tar.gz": 
                pkg_path_without_suffix = pkg_path[:-7]
            with tarfile.open(pkg_path, "r:gz") as tar:
                tar.extractall(pkg_path_without_suffix)          
        else:
            print("[-] unknown pkg format error")
        
        return pkg_path_without_suffix

    def get_node_content(self, f_data: str, node) -> str:
        result = ""
        tmp = f_data.split("\n")
        lines_num = node.end_point[0] - node.start_point[0] + 1
        for j in range(lines_num):
            if lines_num == 1:
                result += tmp[node.start_point[0]][node.start_point[1]:node.end_point[1] + 1]
                break
            if j == 0:
                result += tmp[node.start_point[0]][node.start_point[1]:] + " "
            elif j != lines_num - 1:
                result += tmp[node.start_point[0] + j] + " "
            else:
                result += tmp[node.start_point[0] + j][:node.end_point[1] + 1]
        return result

    def single_pyfile_handler(self, f: str):
        """Parse one Python file and fold its features and call edges into the package state.

        The file is loaded with ``read_file`` and parsed with ``self.parser``.
        The nested handlers then walk the syntax tree and record:

        * feature hits ``(scope, line, feature_id)`` in ``self.packet_parse_result``
          (``(0, 0, 0)`` is the "nothing found" placeholder),
        * call edges ``scope -> [(callee, line), ...]`` in ``self.packet_func_call_edge``,
        * every function seen in ``self.packet_all_funcs``,
        * imported modules in ``self.packet_all_import_modules``.

        Args:
            f: Path of the Python source file to analyse.
        """

        def classify_module(module_name, prefix, line):
            """Return the import feature tuple for ``module_name``, or None if it is not watched."""
            watched = (("R1", self.R1), ("R3", self.R3), ("D1", self.D1),
                       ("E1", self.E1), ("P1", self.P1))
            for feature_id, modules in watched:
                if module_name in modules:
                    return (prefix, line, feature_id)
            return None

        def split_alias(clause):
            """Expand ``name as alias`` into ``[alias, name]``; anything else yields ``[clause]``."""
            tokens = clause.split()
            if len(tokens) >= 3 and tokens[1] == "as":
                return [tokens[2], tokens[0]]
            return [clause]

        def judge_statement(code: str):
            """Classify one statement.

            Returns the resolved names of the calls found in ``code`` (scanned
            right to left, stopping at the first name that matches a call
            table) followed by a feature id, or ``"NOR"`` when nothing matched.
            """
            # Only concern functions
            func_pattern = r'\b([a-zA-Z_][a-zA-Z0-9_.]*)\('
            func_names = re.findall(func_pattern, code)
            result = []

            # Exact-name lookup order.
            call_features = (("R2", self.R2), ("R4", self.R4), ("R5", self.R5),
                             ("D2", self.D2), ("E2", self.E2), ("E3", self.E3),
                             ("P2", self.P2), ("P4", self.P4))
            # Substring fallback order (E3 is not part of the fallback scan).
            substring_features = (("P4", self.P4), ("P2", self.P2), ("R5", self.R5),
                                  ("R2", self.R2), ("R4", self.R4), ("D2", self.D2),
                                  ("E2", self.E2))

            for raw_name in func_names[::-1]:
                # Resolve "import A as B" aliases on the leading component.
                if "." in raw_name:
                    parts = raw_name.split(".")
                    module_name = self.single_pyfile_import_modules_map.get(parts[0], parts[0])
                    full_name = module_name + "." + ".".join(parts[1:])
                else:
                    full_name = raw_name

                result.append(full_name)

                # Judge call type
                for feature_id, calls in call_features:
                    if full_name in calls:
                        return result + [feature_id]

            # The sensitive name may be used without a direct call (e.g. passed around).
            for feature_id, needles in substring_features:
                if any(needle in code for needle in needles):
                    return result + [feature_id]

            # pyarmor
            if "pyarmor" in code: return result + ["P4"]

            # Long string: many string literals, one very long literal, or a very long statement.
            for string_pattern in (r'"([^"]*)"', r"'([^']*)'"):
                matches = re.findall(string_pattern, code)
                if len(matches) > 8:
                    return result + ["E4"]
                if any(len(m) > 64 for m in matches):
                    return result + ["E4"]
            if len(code) > 256:
                return result + ["E4"]

            # URL and ip
            ipv4_pattern = r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'
            ipv6_pattern = r'\b(?:[A-Fa-f0-9]{1,4}:){7}[A-Fa-f0-9]{1,4}\b'
            url_pattern = r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+'
            if (re.findall(ipv4_pattern, code) or re.findall(ipv6_pattern, code)
                    or re.findall(url_pattern, code)):
                return result + ["D3"]

            return result + ["NOR"]

        def module_handler(node, f_data, module_name):
            """Register the module and its top-level functions, then walk every top-level child."""
            assert(node.type == "module")

            self.packet_all_import_modules.add(module_name)

            for child in node.children:
                if child.type == "function_definition":
                    for c in child.children:
                        if c.type == "identifier":
                            # get_node_content slices one column past the node end,
                            # which picks up the "(" that follows the name.
                            content = self.get_node_content(f_data, c).replace("(", "")
                            self.single_pyfile_def_funcs.setdefault(module_name, set()).add(
                                module_name + "." + content)
                ast_handler(child, f_data, module_name, 0)
            return

        def func_name_map_to_complete(func_name, prefix):
            """Qualify ``func_name`` via a matching from-import, else with the parent of ``prefix``."""
            head = func_name.split(".")[0]

            for imported in self.single_pyfile_import_funcs:
                if head == imported.split(".")[-1]:
                    return ".".join(imported.split(".")[:-1]) + "." + func_name

            return ".".join(prefix.split(".")[:-1]) + "." + func_name

        def expression_statement_handler(node, f_data, prefix):
            """Record the calls made by an expression statement and return its feature tuple."""
            content = self.get_node_content(f_data, node)
            line = node.start_point[0] + 1

            # A bare triple-double-quoted string is treated as a docstring and skipped.
            if content[:3] == '"""' and content[-3:] == '"""':
                return [(0, 0, 0)]

            result = judge_statement(content)
            funcs, feature_id = result[:-1], result[-1]

            # Construct packet_func_call_edge
            calls = [(func_name_map_to_complete(name, prefix), line) for name in funcs]
            self.packet_all_funcs.update(calls)
            self.packet_func_call_edge.setdefault(prefix, []).extend(
                call for call in calls if call[0] != prefix)

            scope_leaf = prefix.split(".")[-1]
            if scope_leaf == "setup":
                self.install_running_funcs.update(calls)
            elif scope_leaf == "__init__" and prefix.count("__init__") == 1:
                self.import_running_funcs.update(calls)

            if feature_id != "NOR":
                return [(prefix, line, feature_id)]
            return [(0, 0, 0)]

        def import_statement_handler(node, f_data, prefix):
            """Handle ``import a``, ``import a as b`` and comma-separated combinations.

            Every clause is recorded in the alias map and in the set of
            imported modules. One feature tuple is returned per distinct
            watched category among the imported modules.
            """
            content = self.get_node_content(f_data, node).strip()
            line = node.start_point[0] + 1
            parent = ".".join(prefix.split(".")[:-1])

            # Drop the keyword so that only the comma-separated clauses remain;
            # previously the first clause of "import a, b" kept the "import " prefix
            # and the remaining clauses were never looked at.
            clauses = content[len("import"):] if content.startswith("import") else content

            features = []
            for clause in clauses.split(","):
                # Line continuations and a trailing ";" (picked up by the
                # one-column overshoot of get_node_content) are not part of the name.
                tokens = clause.replace("\\", " ").replace(";", " ").split()
                if not tokens:
                    continue
                module = tokens[0]
                alias = tokens[2] if len(tokens) >= 3 and tokens[1] == "as" else module

                self.single_pyfile_import_modules_map[alias] = module
                self.single_pyfile_import_modules_complete.add(parent + "." + alias)

                feature = classify_module(module, prefix, line)
                if feature is not None and feature not in features:
                    features.append(feature)

            return features or [(0, 0, 0)]

        def import_from_statement_handler(node, f_data, prefix):
            """Handle ``from m import ...`` in its single-line, parenthesised and continued forms."""
            content = self.get_node_content(f_data, node)
            line = node.start_point[0] + 1
            parent = ".".join(prefix.split(".")[:-1])

            def register(module_name, clauses):
                """Record every imported name (and alias) as ``parent.module.name``."""
                names = []
                for clause in clauses:
                    clause = clause.replace("\\", "").strip()
                    if clause:
                        names += split_alias(clause)
                self.single_pyfile_import_funcs.update(
                    parent + "." + module_name + "." + name for name in names)

            import_pattern_1 = r"from\s+(\S+)\s+import\s+([\w, _*]+)"
            import_pattern_2 = r"from\s+(\.*[\w\.]+)\s+import\s+\(([\s\S]*?)\)"
            import_pattern_3 = r"from\s+([\w\.]+)\s+import\s+([\\\s\w,]+as\s[\w\s,]+)"
            match_1 = re.search(import_pattern_1, content)
            match_2 = re.search(import_pattern_2, content)
            match_3 = re.search(import_pattern_3, content)
            if match_1: # not handle *
                module_name = match_1.group(1)
                # "a as b" is now split into alias and name like in the other
                # branches instead of being registered as the literal "a as b".
                register(module_name, match_1.group(2).split(','))
            elif match_2: # from ..?xxx import (xxx, xxx, xxx, ...)
                module_name = match_2.group(1).lstrip(".")
                register(module_name, match_2.group(2).split(','))
            elif match_3: # from xxx import \ xxx as yyy, ...
                module_name = match_3.group(1).lstrip(".")
                # The names of this form were computed but never stored before.
                register(module_name, match_3.group(2).split(','))
            else:
                print("[-] import_from_statement parse error", content)
                return [(0, 0, 0)]

            feature = classify_module(module_name, prefix, line)
            return [feature] if feature is not None else [(0, 0, 0)]

        def try_statement_handler(node, f_data, prefix, indent):
            """Walk only the first ``block`` child of a try statement (the try body)."""
            for child in node.children:
                if child.type == "block":
                    ast_handler(child, f_data, prefix, indent + 2)
                    break
            return

        def class_definition_handler(node, f_data, prefix, indent):
            """Extend the scope prefix with the class name and walk the class children."""
            # Add class name to prefix
            for child in node.children:
                if child.type == "identifier":
                    # Drop the extra trailing character returned by get_node_content.
                    content = self.get_node_content(f_data, child)[:-1]
                    prefix += "." + content

            for child in node.children:
                ast_handler(child, f_data, prefix, indent + 2)

            return [(0, 0, 0)]

        def function_definition_handler(node, f_data, prefix, indent):
            """Extend the scope prefix with the function name, register it and walk its children."""
            # Add function name to prefix
            for child in node.children:
                if child.type == "identifier":
                    content = self.get_node_content(f_data, child).replace("(", "")
                    prefix += "." + content
                    break

            self.packet_all_funcs.add((prefix, node.start_point[0] + 1))
            for child in node.children:
                ast_handler(child, f_data, prefix, indent + 2)

            return [(0, 0, 0)]

        def ast_handler(node, f_data, prefix, indent=0):
            """Dispatch ``node`` to its handler; unknown node types are walked generically.

            ``indent`` is only threaded through for debugging output. Node types
            without a dedicated branch (e.g. decorated definitions, if statements)
            fall into the generic recursion over their children.
            """
            if node.type == "comment":
                pass
            elif node.type == "expression_statement":
                self.single_pyfile_parse_result.update(set(expression_statement_handler(node, f_data, prefix)))
            elif node.type == "import_statement":
                self.single_pyfile_parse_result.update(set(import_statement_handler(node, f_data, prefix)))
            elif node.type == "import_from_statement":
                self.single_pyfile_parse_result.update(set(import_from_statement_handler(node, f_data, prefix)))
            elif node.type == "try_statement":
                try_statement_handler(node, f_data, prefix, indent)
            elif node.type == "class_definition":
                class_definition_handler(node, f_data, prefix, indent)
            elif node.type == "function_definition":
                function_definition_handler(node, f_data, prefix, indent)
            elif node.type == "module":
                pass
            else:
                for child in node.children:
                    ast_handler(child, f_data, prefix, indent + 2)

        # Start every file from clean per-file state.
        self.single_pyfile_import_modules_map = dict()
        self.single_pyfile_import_funcs = set()
        self.single_pyfile_import_modules_complete = set()
        self.single_pyfile_parse_result = set()

        if self.debug_mode: print("[-] handle file:", f)

        f_data, module_name = read_file(f)
        tree = self.parser.parse(bytes(f_data, 'utf-8'))
        module_handler(tree.root_node, f_data, module_name)

        # Fold the per-file results into the package-level state.
        self.packet_parse_result.update(self.single_pyfile_parse_result)
        self.single_pyfile_parse_result = set()

        self.packet_all_import_modules.update(self.single_pyfile_import_modules_complete)
        # When a dotted import name repeats a component twice in a row
        # (a.b.b.c), also record the variant with the duplicate collapsed (a.b.c).
        collapsed = set()
        for name in self.single_pyfile_import_funcs:
            parts = name.split(".")
            for i in range(len(parts) - 1):
                if parts[i] == parts[i + 1]:
                    collapsed.add(".".join(parts[:i]) + "." + ".".join(parts[i + 1:]))
                    break
        self.packet_all_import_modules.update(collapsed)

        self.single_pyfile_import_modules_complete = set()
        self.single_pyfile_import_funcs = set()
        return

    def single_package_handler(self, unpacked_path: str):
        """Analyse every file that ``traverse_py_files`` yields for ``unpacked_path``.

        Args:
            unpacked_path: Directory of an unpacked package.
        """
        for py_file in traverse_py_files(unpacked_path):
            self.single_pyfile_handler(py_file)

    def extractor_init(self) -> bool:
        """Build the tree-sitter grammar library and attach a Python parser to the pipeline.

        Sets ``self.PY_LANGUAGE`` and ``self.parser``.

        Returns:
            Always ``True``.
        """
        grammar_library = 'deps/build/parser.so'
        grammar_sources = ['deps/tree-sitter-python', './deps/tree-sitter-javascript']
        Language.build_library(grammar_library, grammar_sources)

        self.PY_LANGUAGE = Language(grammar_library, 'python')
        self.parser = Parser()
        self.parser.set_language(self.PY_LANGUAGE)

        if self.debug_mode:
            print("[-] tree_sitter init success")
        return True

    def generate_action_chain(self, func_name, order, pre_result, call_times=0):

        call_times += 1
        if call_times > 5: return pre_result

        if func_name in self.packet_func_call_edge.keys():
            next_funcs = self.packet_func_call_edge[func_name]
        else:
            next_funcs = []
        
        hint_action = []
        for it in self.packet_parse_result:
            if it[0] == func_name:
                hint_action.append(it)

        result = pre_result
        sorted_list = sorted(next_funcs + hint_action, key=lambda x: x[1])
        for it in sorted_list:
            assert(len(it) == 2 or len(it) == 3)
            if len(it) == 2:
                result += self.generate_action_chain(it[0], order, [], call_times)
            else:
                result += [it]

        return result
    
    def clear(self):
        self.single_pyfile_import_funcs = set()
        self.single_pyfile_import_modules_complete = set()
        self.single_pyfile_import_modules_map = dict()
        self.single_pyfile_def_funcs = dict()

        self.single_pyfile_parse_result = set()

        self.packet_parse_result = set()

        self.packet_all_import_modules = set()
        self.packet_func_call_edge = dict()
        self.packet_all_funcs = set()

        self.install_running_funcs = set()
        self.import_running_funcs = set()
        self.running_exec_funcs = set()
        self.other_funcs = set()

    def run(self, pypi_normal_path="dataset/PYPI/normal/"):
        """Analyse every supported archive in ``pypi_normal_path`` and print its action chains.

        For each archive matching one of ``self.packet_type`` the package is
        unpacked and analysed, its functions are split into four groups
        (install-time, import-time, ordinary and dunder-named), and the
        description of every feature on each function's action chain is
        printed. All state is cleared before the next archive.

        Args:
            pypi_normal_path: Directory containing the archives. A trailing
                path separator is optional.
        """
        self.extractor_init()

        for pt in self.packet_type:
            # os.path.join keeps the pattern valid when the directory is given
            # without a trailing separator (plain concatenation did not).
            pattern = os.path.join(pypi_normal_path, pt)
            for f in tqdm(glob.glob(pattern)):
                unpacked_path = self.package_unzip(f)
                if not unpacked_path:
                    # package_unzip reports failure with an empty path; there
                    # is nothing to analyse for this archive.
                    continue
                self.single_package_handler(unpacked_path)

                print("packet name: ", unpacked_path)

                # For sequence generator
                # Step1: group the functions by when they run.
                for m in self.packet_all_import_modules:
                    if m in self.packet_func_call_edge:
                        self.import_running_funcs.update(set(self.packet_func_call_edge[m]))
                        self.import_running_funcs.add((m, 0))
                for fc in self.packet_all_funcs:
                    if "__" not in fc[0]:
                        self.running_exec_funcs.add(fc)
                    else:
                        self.other_funcs.add(fc)

                # Step2: fixed group order.
                M = (self.install_running_funcs, self.import_running_funcs, self.running_exec_funcs, self.other_funcs)

                # Step3: print the action chain of every function, group by group.
                for order, funcs in enumerate(M):
                    for fn in funcs:
                        for r in self.generate_action_chain(fn[0], order, []):
                            print(self.action_mapping[r[2]])

                print("-----------------------------------------------------------------------")

                self.clear()

尝试2

技术细节：

结合开源 guarddog 工具搜索。
字符串匹配，包括匹配 ip 正则、常见敏感函数。
针对混淆，计算每个文件的熵。

最终也是取得了 32.73 分，排名 27。源码如下：

# simple_pypi_dt.py
from tqdm import tqdm
import os
import tarfile
import shutil
import re
import math
from pathlib import Path
from collections import Counter

def calculate_entropy(string):
    """Return the Shannon entropy of ``string`` in bits per character.

    An empty string has no symbols to sum over and yields ``0``.
    """
    total = float(len(string))
    probabilities = (occurrences / total for occurrences in Counter(string).values())
    # sum() is kept (rather than a manual loop) so float rounding is unchanged.
    return -sum(prob * math.log(prob, 2) for prob in probabilities)

# Loose dotted-quad matcher: octets are not range-checked, so any
# "1-3 digits" x 4 sequence (including version-like strings) matches.
ip_pattern = r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'

# Common prefix of the dataset directories; the driver loop appends 1..5.
base_pypi_path = "/home/wd2711/Desktop/datacon/dataset/PYPI/test/datacon_pypi_part"

# Substring -> weight.  handle_tgz lower-cases each source file before
# matching, so every key is lower case.  Keys are plain substrings, so
# overlapping keys (e.g. "base64" and "base64.b64decode") are all counted.
sensitive_set = {
    "urlsafe_b64decode": 30,
    "os.system": 50,
    "pyarmor": 100,
    "exec": 50,
    "popen": 40,
    "eval": 100,
    "base64.b64decode": 30,
    "base64.b64encode": 30,
    "whoami": 100,
    "/passwd": 100,
    "environ": 60,
    "getpid": 50,
    "getenv": 60,
    "getcwd": 30,
    "subprocess.run": 40,
    "getlogin": 100,
    "accept": 20,
    "gethostname": 100,
    "urlopen": 30,
    "b64encode": 40,
    "pyexecjs": 40,
    "cpuinfo": 40,
    ".com": 50,
    "requests.post": 50,
    "socket.send": 50,
    "requests.send": 50,
    "asyncio.run": 60,
    "getfqdn": 70,
    "create_task": 40,
    "select.poll": 40,
    "net_if_addrs": 80,
    "disk_usage": 80,
    "getusername": 80,
    "gethostbyname": 80,
    "platform": 30,
    "chown": 100,
    "chmod": 100,
    "base64": 30,
}

# Path substring -> bonus added to every hit found in a matching file.
# Order matters: when several keys match a path, the last one wins.
file_score = {
    "init": 70,
    "setup": 100,
    "main": 70,
}

def handle_tgz(path):
    """Extract one ``.tgz`` package, score its ``*.py`` files, print a summary.

    The archive is unpacked into a scratch directory next to it (``path``
    without the ``.tgz`` suffix).  Every ``*.py`` file that can be read as
    UTF-8 is lower-cased and scored:

    * each ``sensitive_set`` key contained in the text adds its weight plus
      the file bonus;
    * any match of ``ip_pattern`` adds ``100`` plus the file bonus;
    * more than ten ``+`` characters adds ``100`` plus the file bonus.

    The file bonus is the ``file_score`` value of the last key contained in
    the file's path, or ``0`` when no key matches.  When both the accumulated
    score and the total text length are non-zero, one line is printed:
    ``path score total_len score/total_len max_file_entropy``.

    Archives that cannot be opened or safely extracted are reported on
    stderr and skipped.  The scratch directory is removed in every case.

    Args:
        path: Path of the ``.tgz`` archive to inspect.
    """
    import sys  # local import: only used for the skip diagnostic

    ext_path = path[:-4]  # drop the ".tgz" suffix

    try:
        try:
            with tarfile.open(path, 'r:gz') as archive:
                # The packages under analysis are untrusted.  Where available,
                # the "data" filter refuses members that would land outside
                # ext_path (absolute paths, "..", escaping links).
                if hasattr(tarfile, 'data_filter'):
                    archive.extractall(path=ext_path, filter='data')
                else:
                    archive.extractall(path=ext_path)
        except (tarfile.TarError, OSError, EOFError) as exc:
            # One broken archive should not abort the whole scan.
            print("skip {}: {}".format(path, exc), file=sys.stderr)
            return

        pkg_entropy = 0
        score = 0
        data_len = 0

        for file_path in Path(ext_path).rglob('*.py'):
            try:
                with file_path.open('r', encoding='utf-8') as file:
                    data = file.read().lower()
            except (OSError, UnicodeError):
                # Unreadable or non-UTF-8 file: leave it out of the score.
                continue

            # Recomputed for every file so that the bonus of e.g. "setup.py"
            # does not carry over to unrelated files scanned afterwards.
            file_s = 0
            path_text = str(file_path)
            for k, bonus in file_score.items():
                if k in path_text:
                    file_s = bonus

            data_len += len(data)
            for k, weight in sensitive_set.items():
                if k in data:
                    score += weight + file_s

            if re.findall(ip_pattern, data):
                score += 100 + file_s

            # NOTE(review): many "+" presumably targets string-concatenation
            # obfuscation; the threshold of 10 is as in the original.
            if data.count('+') > 10:
                score += 100 + file_s

            pkg_entropy = max(calculate_entropy(data), pkg_entropy)

        if data_len != 0 and score != 0:
            print("{} {} {} {:.2f} {:.2f}".format(path, score, data_len, score / data_len, pkg_entropy))
    finally:
        # Extraction may have failed before the directory was created.
        shutil.rmtree(ext_path, ignore_errors=True)
    return

# Walk the five dataset directories (suffixes 1..5) and score every .tgz found.
for part_no in range(1, 6):
    part_dir = "{}{}".format(base_pypi_path, part_no)

    for current_dir, _subdirs, names in os.walk(part_dir):
        for name in tqdm(names):
            if not name.endswith('.tgz'):
                continue
            handle_tgz(os.path.join(current_dir, name))

# simple_npm_dt.py
from tqdm import tqdm
import os
import tarfile
import shutil
import re
import math
from pathlib import Path
from collections import Counter

def calculate_entropy(string):
    """Return the Shannon entropy of ``string`` in bits per character.

    An empty string has no symbols to sum over and yields ``0``.
    """
    length = float(len(string))
    counts = Counter(string).values()
    # Each term is p * log2(p) with p = count / length; sum() is kept so the
    # float rounding matches the one-line form exactly.
    terms = (n / length * math.log(n / length, 2) for n in counts)
    return -sum(terms)

# Loose dotted-quad matcher: octets are not range-checked, so any
# "1-3 digits" x 4 sequence (including version-like strings) matches.
ip_pattern = r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b'

# Common prefix of the dataset directories; the driver loop appends 1..5.
base_npm_path = "/home/wd2711/Desktop/datacon/dataset/NPM/test/datacon_npm_part"

# Substring -> weight.  handle_tgz lower-cases each source file before
# matching, so every key is lower case.  Negative weights act as
# corrections: a file containing "evalu" also contains "eval", and one
# containing ".comp" also contains ".com", so each pair nets to zero.
# "localhost" is a plain penalty.
sensitive_set = {
    "evalu": -100,
    "eval": 100,
    "fork": 20,
    "unlinksync": 20,
    "exec": 50,
    "runinnewcontext": 50,
    "process.env": 30,
    "hostname": 50,
    "writefile": 10,
    "readfilesync": 30,
    "readfile": 30,
    "http.request": 30,
    "spawn": 30,
    "os.platform": 50,
    "connect": 15,
    "url.parse": 10,
    "base64": 50,
    "axios": 15,
    "setinterval": 10,
    "buffer.from": 15,
    ".bin": 50,
    ".com": 50,
    ".comp": -50,
    "child_process": 30,
    "btoa": 50,
    "atob": 50,
    "passwd": 100,
    "curl": 50,
    "post": 30,
    "createcipher": 50,
    "chmod": 100,
    "https.get": 50,
    "/bin/sh": 100,
    "id_rsa": 100,
    "homedir": 30,
    "sendbeacon": 30,
    "addeventlistener": 20,
    "keypress": 50,
    "localhost": -100,
    "userinfo": 70,
    "getservers": 70,
}

# Path substring -> bonus applied to every hit found in a matching file
# (added for positive weights, subtracted for negative ones).
# Order matters: when several keys match a path, the last one wins.
file_score = {
    "init": 70,
    "setup": 100,
    "main": 70,
    "index": 100,
    "build": 100,
}

def handle_tgz(path):
    """Extract one ``.tgz`` package, score its ``*.js`` files, print a summary.

    The archive is unpacked into a scratch directory next to it (``path``
    without the ``.tgz`` suffix).  Every ``*.js`` file that can be read as
    UTF-8 is lower-cased and scored:

    * each ``sensitive_set`` key contained in the text contributes its weight;
      the file bonus is added to positive weights and subtracted from
      negative ones;
    * any match of ``ip_pattern`` adds ``100`` plus the file bonus;
    * more than ten ``+`` characters adds ``100`` plus the file bonus.

    The file bonus is the ``file_score`` value of the last key contained in
    the file's path, or ``0`` when no key matches.  When both the accumulated
    score and the total text length are non-zero, one line is printed:
    ``path score total_len score/total_len max_file_entropy``.

    Archives that cannot be opened or safely extracted are reported on
    stderr and skipped.  The scratch directory is removed in every case.

    Args:
        path: Path of the ``.tgz`` archive to inspect.
    """
    import sys  # local import: only used for the skip diagnostic

    ext_path = path[:-4]  # drop the ".tgz" suffix

    try:
        try:
            with tarfile.open(path, 'r:gz') as archive:
                # The packages under analysis are untrusted.  Where available,
                # the "data" filter refuses members that would land outside
                # ext_path (absolute paths, "..", escaping links).
                if hasattr(tarfile, 'data_filter'):
                    archive.extractall(path=ext_path, filter='data')
                else:
                    archive.extractall(path=ext_path)
        except (tarfile.TarError, OSError, EOFError) as exc:
            # One broken archive should not abort the whole scan.
            print("skip {}: {}".format(path, exc), file=sys.stderr)
            return

        score = 0
        data_len = 0
        pkg_entropy = 0

        for file_path in Path(ext_path).rglob('*.js'):
            try:
                with file_path.open('r', encoding='utf-8') as file:
                    data = file.read().lower()
            except (OSError, UnicodeError):
                # Unreadable or non-UTF-8 file: leave it out of the score.
                continue

            # Recomputed for every file so that the bonus of e.g. "index.js"
            # does not carry over to unrelated files scanned afterwards.
            file_s = 0
            path_text = str(file_path)
            for k, bonus in file_score.items():
                if k in path_text:
                    file_s = bonus

            data_len += len(data)
            for k, weight in sensitive_set.items():
                if k in data:
                    # The bonus amplifies the weight in its own direction.
                    if weight < 0:
                        score += weight - file_s
                    else:
                        score += weight + file_s

            if re.findall(ip_pattern, data):
                score += 100 + file_s

            # NOTE(review): many "+" presumably targets string-concatenation
            # obfuscation; the threshold of 10 is as in the original.
            if data.count('+') > 10:
                score += 100 + file_s

            pkg_entropy = max(calculate_entropy(data), pkg_entropy)

        if data_len != 0 and score != 0:
            print("{} {} {} {:.2f} {:.2f}".format(path, score, data_len, score / data_len, pkg_entropy))
    finally:
        # Extraction may have failed before the directory was created.
        shutil.rmtree(ext_path, ignore_errors=True)
    return

# Walk the five dataset directories (suffixes 1..5) and score every .tgz found.
for part_no in range(1, 6):
    part_dir = "{}{}".format(base_npm_path, part_no)

    for current_dir, _subdirs, names in os.walk(part_dir):
        for name in tqdm(names):
            if not name.endswith('.tgz'):
                continue
            handle_tgz(os.path.join(current_dir, name))

结果

留言

作者: wd-z711
文章链接: https://wd-2711.tech/
版权声明: 本博客所有文章除特别声明外，均采用 CC BY-NC-ND 4.0 许可协议。转载请注明出处!

⬆︎TOP

datacon-2024

Datacon 2024 supply chain write up for QAQ

尝试1

尝试2

结果

留言