Filtrar directorios y archivos en os.walk()


Estoy buscando una manera de incluir/excluir patrones de archivos y excluir directorios de una llamada os.walk().

Esto es lo que estoy haciendo ahora:

import fnmatch
import os

includes = ['*.doc', '*.odt']
excludes = ['/home/paulo-freitas/Documents']

def _filter(paths):
    matches = []

    for path in paths:
        append = None

        for include in includes:
            if os.path.isdir(path):
                append = True
                break

            if fnmatch.fnmatch(path, include):
                append = True
                break

        for exclude in excludes:
            if os.path.isdir(path) and path == exclude:
                append = False
                break

            if fnmatch.fnmatch(path, exclude):
                append = False
                break

        if append:
            matches.append(path)

    return matches

# Walk the home directory and print every file accepted by _filter().
for root, dirs, files in os.walk('/home/paulo-freitas'):
    # Assigning through dirs[:] edits the list os.walk() itself iterates, so
    # rejected directories are never descended into. The entries become full
    # paths; because root is absolute, os.walk()'s own join keeps them as-is.
    dirs[:] = _filter([os.path.join(root, d) for d in dirs])
    files[:] = _filter([os.path.join(root, f) for f in files])

    # files already holds full paths at this point, so no second join is
    # needed. print() with one argument behaves the same on Python 2 and 3.
    for filename in files:
        print(filename)

La pregunta es: ¿hay una mejor manera de hacer esto? ¿Cómo?

Author: Paulo Freitas, 2011-02-28

8 answers

Esta solución utiliza fnmatch.translate para convertir los patrones glob en expresiones regulares (asume que includes solo se aplica a archivos):

import fnmatch
import os
import os.path
import re

includes = ['*.doc', '*.odt']  # for files only
excludes = ['/home/paulo-freitas/Documents']  # for dirs and files

# Translate the glob patterns to regular expressions and compile them once,
# outside the walk loop. The pattern lists keep their names; the compiled
# forms get their own so a name never changes type half-way through.
include_re = re.compile(r'|'.join(fnmatch.translate(x) for x in includes))
# With no exclude patterns the join yields '', which would match everything;
# r'$.' is a regex that can never match, so nothing is excluded instead.
exclude_re = re.compile(
    r'|'.join(fnmatch.translate(x) for x in excludes) or r'$.')

for root, dirs, files in os.walk('/home/paulo-freitas'):

    # Prune excluded directories in place so os.walk() skips their subtrees.
    dirs[:] = [d for d in dirs
               if not exclude_re.match(os.path.join(root, d))]

    # Keep files that are not excluded and that match an include pattern.
    paths = [os.path.join(root, f) for f in files]
    selected = [p for p in paths
                if not exclude_re.match(p) and include_re.match(p)]

    for fname in selected:
        # Single-argument print() works on both Python 2 and Python 3.
        print(fname)
 46
Author: Oben Sonne, 2011-02-28 14:30:36

De docs.python.org:

os.walk(top[, topdown=True[, onerror=None[, followlinks=False]]])

Cuando topdown es True, el llamante puede modificar la lista dirnames in situ (…); esto se puede usar para podar la búsqueda (…)

for root, dirs, files in os.walk('/home/paulo-freitas', topdown=True):
    # Excludes can be done with fnmatch.filter and a complementary set,
    # but that is more annoying to read. Slice assignment edits the list
    # os.walk() iterates, so excluded names are not descended into.
    dirs[:] = [d for d in dirs if d not in excludes]
    for pat in includes:
        for f in fnmatch.filter(files, pat):
            # Single-argument print() works on both Python 2 and Python 3.
            print(os.path.join(root, f))

Debo señalar que el código anterior asume que excludes es un patrón, no una ruta completa. Habría que ajustar la lista por comprensión para filtrar con if os.path.join(root, d) not in excludes y así coincidir con el caso del autor de la pregunta.

 21
Author: kojiro, 2015-12-07 15:40:44

¿Por qué fnmatch?

import os
excludes = []  # placeholder: fill in the directory names to skip
for root, dirnames, filenames in os.walk("/path"):
    for filename in filenames:
        # The leading dot matters: without it any name that merely ends in
        # "doc" or "odt" (for example "mydoc") would be reported as well.
        if filename.endswith(('.doc', '.odt')):
            print(filename)
    for directory in dirnames:
        if directory not in excludes:
            print(directory)

No sometido a pruebas exhaustivas

 6
Author: kurumi, 2011-02-28 11:42:38

Dirtools es perfecto para su caso de uso:

from dirtools import Dir

# Build a Dir for the current directory, passing '.gitignore' as
# exclude_file, and print whatever its files() method returns.
# NOTE(review): presumably the patterns in .gitignore are applied as
# exclusion rules -- confirm against the dirtools documentation.
print(Dir('.', exclude_file='.gitignore').files())
 1
Author: michaeljoseph, 2014-07-18 21:42:20

Aquí hay una manera de hacer eso

import fnmatch
import os

excludes = ['/home/paulo-freitas/Documents']
matches = []
for path, dirs, files in os.walk(os.getcwd()):
    # Skip every directory whose path contains one of the excluded paths.
    # A for/else with `continue` in the inner loop does not do this: that
    # `continue` only advances the inner loop, so the else branch would run
    # for every directory and nothing would ever be excluded.
    if any(excluded in path for excluded in excludes):
        # Every descendant path contains the same substring, so there is
        # no point in descending any further.
        dirs[:] = []
        continue

    for filename in files:
        if (fnmatch.fnmatch(filename, '*.doc')
                or fnmatch.fnmatch(filename, '*.odt')):
            matches.append(os.path.abspath(os.path.join(path, filename)))
# Single-argument print() works on both Python 2 and Python 3.
print(matches)
 0
Author: Senthil Kumaran, 2011-02-28 12:44:41
import os
includes = ['*.doc', '*.odt']
excludes = ['/home/paulo-freitas/Documents']


def file_search(path, exe):
    """Print the full path of every file below *path* whose name ends in *exe*.

    Args:
        path: Directory to walk recursively.
        exe: File-name suffix to look for. It may be given as an extension
            ('.doc') or as a simple glob ('*.doc'); leading '*' characters
            are dropped before comparing.
    """
    # Comparing the last four characters against a glob such as '*.doc'
    # (five characters) can never succeed, so compare against the bare
    # suffix instead. This also handles extensions of any length.
    suffix = exe.lstrip('*')
    for dirpath, _dirnames, filenames in os.walk(path):
        for name in filenames:
            if name.endswith(suffix):
                print(os.path.join(dirpath, name))


# Search the first configured directory once per include pattern.
for pattern in includes:
    file_search(excludes[0], pattern)
 0
Author: juniour, 2012-12-26 05:53:18

Este es un ejemplo de exclusión de directorios y archivos con os.walk():

ignoreDirPatterns = [".git"]
ignoreFilePatterns = [".php"]


def copyTree(src, dest, onerror=None):
    """Copy the tree under *src* into *dest*, skipping ignored entries.

    A directory is skipped (together with everything below it) when its path
    contains any substring from ``ignoreDirPatterns``; a file is skipped when
    its name contains any substring from ``ignoreFilePatterns``. Destination
    directories are created only when there is at least one file to copy
    into them. Requires Python >= 3.2 because of ``exist_ok``.

    Args:
        src: Source directory.
        dest: Destination directory; created on demand.
        onerror: Optional callable receiving the ``OSError`` instance when
            listing a directory (it is passed on to ``os.walk``) or creating
            a destination directory fails. When it is ``None`` such errors
            are ignored, as ``os.walk`` itself does.
    """
    src = os.path.abspath(src)
    for root, dirs, files in os.walk(src, onerror=onerror):
        if any(pattern in root for pattern in ignoreDirPatterns):
            # Every descendant path contains the same substring and would be
            # skipped as well, so stop os.walk() from descending at all.
            dirs[:] = []
            continue

        wanted = [name for name in files
                  if not any(pattern in name
                             for pattern in ignoreFilePatterns)]
        if not wanted:
            continue

        # relpath is robust where slicing off len(src) + 1 characters is not
        # (for instance when src is the filesystem root).
        relative = os.path.relpath(root, src)
        if relative == os.curdir:
            dirpath = dest
        else:
            dirpath = os.path.join(dest, relative)

        try:
            os.makedirs(dirpath, exist_ok=True)
        except OSError as e:
            if onerror is not None:
                onerror(e)
            # The destination directory is unusable; copying into it would
            # only fail again (or overwrite a plain file of that name).
            continue

        for name in wanted:
            shutil.copy(os.path.join(root, name), dirpath)

Requiere Python >= 3.2 debido al uso de exist_ok en os.makedirs().

 0
Author: Jahid, 2015-05-11 18:57:56

Los métodos anteriores no habían funcionado para mí.

Entonces, esto es lo que se me ocurrió con una expansión de mi respuesta original a otra pregunta.

Lo que funcionó para mí fue:

if (not (str(root) + '/').startswith(tuple(exclude_foldr)))

Que compiló una ruta y excluyó la tupla de mis carpetas listadas.

Esto me dio el resultado exacto que estaba buscando.

Mi objetivo para esto era mantener mi mac organizada.

Puedo buscar en cualquier carpeta por su ruta, localizar y mover tipos de archivo específicos, ignorar subcarpetas y preguntar de antemano al usuario si quiere mover los archivos.

NOTA: el Prompt es solo una vez por ejecución y NO es por archivo

De forma predeterminada, la respuesta es NO cuando se pulsa Enter sin contestar a [y/N], y en ese caso solo se muestran los archivos que se podrían mover.

Esto es solo un fragmento de mi GitHub; por favor, visítelo para ver el script completo.

SUGERENCIA: Lea el script a continuación, ya que agregué comentarios línea por línea que explican lo que hace.

#!/usr/bin/env python3
# =============================================================================
# Created On  : MAC OSX High Sierra 10.13.6 (17G65)
# Created On  : Python 3.7.0
# Created By  : Jeromie Kirchoff
# =============================================================================
"""THE MODULE HAS BEEN BUILD FOR KEEPING YOUR FILES ORGANIZED."""
# =============================================================================
from os import walk
from os import path
from shutil import move
import getpass
import click

# Login name of the current user; every path below is built from it.
mac_username = getpass.getuser()

# File-name endings that select the files to move.
includes_file_extensn = [
    ".jpg",
    ".gif",
    ".png",
    ".jpeg",
]

# path.dirname() on a path ending in '/' returns it without that slash.
search_dir = path.dirname('/Users/' + mac_username + '/Documents/')
target_foldr = path.dirname('/Users/' + mac_username + '/Pictures/Archive/')

# Folders that must not be searched (the target folder included).
exclude_foldr = {
    target_foldr,
    path.dirname('/Users/' + mac_username + '/Documents/GitHub/'),
    path.dirname('/Users/' + mac_username + '/Documents/Random/'),
    path.dirname('/Users/' + mac_username + '/Documents/Stupid_Folder/'),
}

# Ask once, up front, whether files should really be moved; the default
# answer is "no".
question_moving = (
    True
    if click.confirm("Would you like to move files?", default=False)
    else False
)


def organize_files():
    """List, and optionally move, the matching files found under search_dir.

    Walks ``search_dir`` and, for every file whose name ends with one of
    ``includes_file_extensn`` and that does not live in (or below) a folder
    from ``exclude_foldr``, prints the file path. When ``question_moving``
    is ``True`` the file is also moved into ``target_foldr`` under its own
    base name.
    """
    # Give every excluded folder exactly one trailing '/', so that the prefix
    # test below only matches whole path components: excluding ".../GitHub"
    # must not exclude a sibling such as ".../GitHubStuff" as well.
    excluded_prefixes = tuple(str(folder).rstrip('/') + '/'
                              for folder in exclude_foldr)
    wanted_suffixes = tuple(includes_file_extensn)

    # topdown=True is required for the in-place pruning of dirnames to work.
    for root, dirnames, files in walk(search_dir, topdown=True):
        # The exclusion depends on the directory only, so test it once per
        # directory rather than once per file.
        if (str(root) + '/').startswith(excluded_prefixes):
            # Nothing below an excluded folder is wanted: stop descending.
            dirnames[:] = []
            continue

        for filename in files:
            # Only handle the file types we are looking for.
            if not filename.endswith(wanted_suffixes):
                continue

            # normpath guards against doubled '/' in the resulting paths.
            filetomove = path.normpath(path.join(str(root), str(filename)))
            movingfileto = path.normpath(
                path.join(str(target_foldr), str(filename)))

            # Answering "no" at the prompt only lists the candidate files.
            print('Files To Move: ' + str(filetomove))

            # Uses the answer given to the prompt at start-up.
            if question_moving is True:
                print('Moving File: ' + str(filetomove) +
                      "\n To:" + str(movingfileto))
                move(filetomove, movingfileto)


# Run the organiser only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    organize_files()

Ejemplo de ejecutar mi script desde terminal:

$ python3 organize_files.py
Exclude list: {'/Users/jkirchoff/Pictures/Archive', '/Users/jkirchoff/Documents/Stupid_Folder', '/Users/jkirchoff/Documents/Random', '/Users/jkirchoff/Documents/GitHub'}
Files found will be moved to this folder:/Users/jkirchoff/Pictures/Archive
Would you like to move files?
No? This will just list the files.
Yes? This will Move your files to the target folder.
[y/N]: 

Ejemplo de listado de archivos:

Files To Move: /Users/jkirchoff/Documents/Archive/JayWork/1.custom-award-768x512.jpg
Files To Move: /Users/jkirchoff/Documents/Archive/JayWork/10351458_318162838331056_9023492155204267542_n.jpg
...etc

Ejemplo de mover archivos:

Moving File: /Users/jkirchoff/Documents/Archive/JayWork/1.custom-award-768x512.jpg
To: /Users/jkirchoff/Pictures/Archive/1.custom-award-768x512.jpg
Moving File: /Users/jkirchoff/Documents/Archive/JayWork/10351458_318162838331056_9023492155204267542_n.jpg
To: /Users/jkirchoff/Pictures/Archive/10351458_318162838331056_9023492155204267542_n.jpg
...
 0
Author: JayRizzo, 2018-08-16 07:20:59