def create_packages_archive()

in luigi/contrib/hadoop.py [0:0]


def create_packages_archive(packages, filename):
    """
    Create a tar archive which will contain the files for the packages listed in packages.

    :param packages: iterable of already-imported module or package objects.
                     A plain submodule (dotted ``__name__`` and no
                     ``__path__``) is replaced by its parent package, so the
                     whole package ends up in the archive.
    :param filename: path of the tar file to create; it is opened in
                     uncompressed write mode (``"w"``).
    :raises RuntimeError: if a package's first ``__path__`` entry is an
                          ``.egg`` file, which is not supported.

    Files whose names end in ``.pyc`` or start with ``.`` are skipped, and
    ``.svn`` directories are not descended into. Every source path is passed
    through the module-level ``dereference`` helper before being added
    (presumably it resolves symlinks -- confirm against its definition).
    """
    import importlib
    import tarfile

    # The context manager guarantees the archive file handle is released even
    # when adding a file or importing a parent package raises.
    with tarfile.open(filename, "w") as tar:

        def add(src, dst):
            logger.debug('adding to tar: %s -> %s', src, dst)
            tar.add(src, dst)

        def add_files_for_package(sub_package_path, root_package_path, root_package_name):
            for root, dirs, files in os.walk(sub_package_path):
                # Pruning in place stops os.walk from descending into .svn.
                if '.svn' in dirs:
                    dirs.remove('.svn')
                # Only the leading on-disk prefix is mapped to the archive
                # name; later occurrences of the same text are left alone.
                archive_root = root.replace(root_package_path, root_package_name, 1)
                for f in files:
                    if f.endswith(".pyc") or f.startswith("."):
                        continue
                    add(dereference(os.path.join(root, f)), archive_root + "/" + f)

        for package in packages:
            # Put a submodule's entire package in the archive. This is the
            # magic that usually packages everything you need without
            # having to attach packages/modules explicitly
            if not getattr(package, "__path__", None) and '.' in package.__name__:
                package = importlib.import_module(package.__name__.rpartition('.')[0])

            n = package.__name__.replace(".", "/")

            if getattr(package, "__path__", None):
                # TODO: (BUG) picking only the first path does not
                # properly deal with namespaced packages in different
                # directories
                p = package.__path__[0]

                if p.endswith('.egg') and os.path.isfile(p):
                    # Raise a real exception: raising a bare string fails
                    # with an unrelated TypeError instead of this message.
                    raise RuntimeError('egg files not supported!!!')

                # include __init__ files from parent projects
                root = []
                for parent in package.__name__.split('.')[0:-1]:
                    root.append(parent)
                    module_name = '.'.join(root)
                    directory = '/'.join(root)

                    parent_dir = importlib.import_module(module_name).__path__[0]
                    add(dereference(os.path.join(parent_dir, "__init__.py")),
                        directory + "/__init__.py")

                add_files_for_package(p, p, n)

                # include egg-info directories that are parallel:
                for egg_info_path in glob.glob(p + '*.egg-info'):
                    logger.debug(
                        'Adding package metadata to archive for "%s" found at "%s"',
                        package.__name__,
                        egg_info_path
                    )
                    add_files_for_package(egg_info_path, p, n)

            else:
                f = package.__file__
                if f.endswith(".pyc"):
                    # Ship the source file that sits next to the bytecode.
                    f = f[:-1]
                # n has "/" where the module name had "."; a nested module
                # keeps its package-relative location in the archive, while a
                # top-level module is stored under its own file name.
                if "/" in n:
                    add(dereference(f), n + ".py")
                else:
                    add(dereference(f), os.path.basename(f))