python - Split columns containing lists from CSV into separate CSV files with pandas - Stack Overflow

I have CSV files with multiple columns of data retrieved from APIs, where each cell may contain either

I have CSV files with multiple columns of data retrieved from APIs, where each cell may contain either a single value or a list/array. The size of these lists is consistent across each column (e.g., a column named ALPHANUMS having a row containing a list like "['A', 'B', '4']" has the same list size of a column named COLOR having a row containing a list "['red', 'blue', 'green']", but the list sizes can vary per CSV file depending on the API response. I would like to use pandas to create separate CSV files for each element in a list column, while retaining the rest of the data in each file.

Here's an example of what the data might look like from this mockup function:

import random
import csv

# Predefined lists for NAME, CARS, and PHONE OS
NAMES = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Eve White", "David Wilson", "Emma Taylor", "Frank Harris", "Grace Clark"]
CAR_BRANDS = ["Toyota", "Ford", "BMW", "Tesla", "Honda", "Chevrolet", "Nissan", "Audi"]
PHONE_OS = ["Android", "iOS"]

def create_csv(file_name, num_records):
    cur_random_list_size = random.randint(1, min(len(NAMES), len(CAR_BRANDS)))
    with open(file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        
        writer.writerow(["ID", "NAME", "MONTH", "CARS", "PHONE OS"])

        for i in range(num_records):

            record = {
                "id" : i + 1,
                "name": [NAMES[n] for n in range(cur_random_list_size)],
                "month": random.randint(1,12),
                "cars": [random.choice(CAR_BRANDS) for _ in range(cur_random_list_size)],
                "phone": random.choice(PHONE_OS)
            }
            writer.writerow(record.values())
    
    print(f"CSV file '{file_name}' created with {num_records} records.")

create_csv("people_data.csv", 5)
ID NAME MONTH CARS PHONE OS
1 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 2 "['Toyota', 'Nissan', 'Nissan', 'Nissan', 'Audi', 'Honda']" iOS
2 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 4 "['Nissan', 'Ford', 'Honda', 'Toyota', 'Ford', 'Honda']" iOS
3 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 8 "['BMW', 'Honda', 'Tesla', 'Tesla', 'Tesla', 'Nissan']" Android
4 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 3 "['Tesla', 'Audi', 'Chevrolet', 'Audi', 'Chevrolet', 'BMW']" iOS
5 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 8 "['Ford', 'Tesla', 'BMW', 'Toyota', 'Nissan', 'Ford']" Android

I have CSV files with multiple columns of data retrieved from APIs, where each cell may contain either a single value or a list/array. The size of these lists is consistent across each column (e.g., a column named ALPHANUMS having a row containing a list like "['A', 'B', '4']" has the same list size of a column named COLOR having a row containing a list "['red', 'blue', 'green']", but the list sizes can vary per CSV file depending on the API response. I would like to use pandas to create separate CSV files for each element in a list column, while retaining the rest of the data in each file.

Here's an example of what the data might look like from this mockup function:

import random
import csv

# Predefined lists for NAME, CARS, and PHONE OS
NAMES = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Eve White", "David Wilson", "Emma Taylor", "Frank Harris", "Grace Clark"]
CAR_BRANDS = ["Toyota", "Ford", "BMW", "Tesla", "Honda", "Chevrolet", "Nissan", "Audi"]
PHONE_OS = ["Android", "iOS"]

def create_csv(file_name, num_records):
    cur_random_list_size = random.randint(1, min(len(NAMES), len(CAR_BRANDS)))
    with open(file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        
        writer.writerow(["ID", "NAME", "MONTH", "CARS", "PHONE OS"])

        for i in range(num_records):

            record = {
                "id" : i + 1,
                "name": [NAMES[n] for n in range(cur_random_list_size)],
                "month": random.randint(1,12),
                "cars": [random.choice(CAR_BRANDS) for _ in range(cur_random_list_size)],
                "phone": random.choice(PHONE_OS)
            }
            writer.writerow(record.values())
    
    print(f"CSV file '{file_name}' created with {num_records} records.")

create_csv("people_data.csv", 5)
ID NAME MONTH CARS PHONE OS
1 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 2 "['Toyota', 'Nissan', 'Nissan', 'Nissan', 'Audi', 'Honda']" iOS
2 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 4 "['Nissan', 'Ford', 'Honda', 'Toyota', 'Ford', 'Honda']" iOS
3 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 8 "['BMW', 'Honda', 'Tesla', 'Tesla', 'Tesla', 'Nissan']" Android
4 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 3 "['Tesla', 'Audi', 'Chevrolet', 'Audi', 'Chevrolet', 'BMW']" iOS
5 "['John Doe', 'Jane Smith', 'Alice Johnson', 'Bob Brown', 'Charlie Davis', 'Eve White']" 8 "['Ford', 'Tesla', 'BMW', 'Toyota', 'Nissan', 'Ford']" Android

And ideally, I'd like to separate this into five individual csv files, as an example for john_doe_people_data.csv:

ID NAME MONTH CARS PHONE OS
1 John Doe 2 Toyota iOS
2 John Doe 4 Nissan iOS
3 John Doe 8 BMW Android
4 John Doe 3 Tesla iOS
5 John Doe 8 Ford Android

All in all, how can I use pandas to create separate CSV files for each element in a list column, while keeping the rest of the data in each file?

Share Improve this question edited Dec 3, 2024 at 13:55 kyrlon asked Nov 18, 2024 at 23:07 kyrlonkyrlon 1,3402 gold badges13 silver badges28 bronze badges 5
  • These lists - they are encoded as strings in the CSV. What format are they? JSON, or perhaps python (e.g., use ast.literal_eval)? – tdelaney Commented Nov 18, 2024 at 23:51
  • I think this would be easier with the CSV module. Create a dict mapping name to an empty table (defaultdict works for this). Iterate each row, break open the list and put the row (after some tidying) in that mapped list. Now each of those lists can be saved with csv.writer. So, are you absolutely wed to pandas? – tdelaney Commented Nov 18, 2024 at 23:54
  • @tdelaney yes they are encoded as strings – kyrlon Commented Nov 19, 2024 at 2:00
  • That they are strings is normal for a CSV. Its a question of how they are encoded. How do we convert that string into a list? I gave two common options. JSON makes sense, but your example isn't JSON. Python makes less sense (a python only solution in a CSV is limiting) but does decode your example. – tdelaney Commented Nov 19, 2024 at 2:31
  • 1 @kyrlon Please check my answer below that completes the task with a few lines of code. – LMC Commented Nov 19, 2024 at 20:54
Add a comment  | 

5 Answers 5

Reset to default 1

Another possible solution, whose steps are (where df is the dataframe resulting from reading the csv file):

  • Each column's string values are cleaned by removing square brackets using replace and then split into lists with split.

  • The resulting dataframe is then expanded with explode, transforming each element of the list-like columns into separate rows.

  • Finally, the dataframe is grouped by the NAME column using groupby, and a list comprehension is employed to generate a list of dataframes, each corresponding to a unique NAME.

[g for _, g in df.assign(**{
    x: df[x].str.replace(r'\[|\]', '', regex=True).str.split(',') 
    for x in ['NAME', 'CARS']})
 .explode(['NAME', 'CARS']).groupby('NAME')]

Columns containing list as strings can be easily parsed as json with a custom converter when reading the csv. Then a double explode() would complete the task

#import csv
import pandas as pd
from io import StringIO

def CustomParser(data):
    j1 = pd.read_json(StringIO(data.replace("'", '"')))

    return j1[0].values.tolist()

f1 = "/home/lmc/tmp/people_data.csv"
#create_csv(f1, 5)

df = pd.read_csv(f1, converters={'NAME':CustomParser, 'CARS':CustomParser},header=0)

df = df.explode('NAME').explode('CARS')
df_doe = df[df['NAME'] == 'John Doe']

print(df_doe)

Result for first item in NAMES

   ID      NAME  MONTH       CARS PHONE OS
0   1  John Doe     10      Honda  Android
0   1  John Doe     10     Toyota  Android
1   2  John Doe      9       Audi      iOS
1   2  John Doe      9      Honda      iOS
2   3  John Doe     12       Audi      iOS
2   3  John Doe     12      Honda      iOS
3   4  John Doe     10       Ford      iOS
3   4  John Doe     10        BMW      iOS
4   5  John Doe      5  Chevrolet      iOS
4   5  John Doe      5     Nissan      iOS

I believe the following gives what you are looking for:

import pandas as pd
import ast

def explode_list_columns(df, list_columns):
    """
    Input:
    df: pandas DataFrame
    list_columns: list of columns in the DataFrame that contain lists

    Output:
    df: pandas DataFrame with each element of the lists in the specified columns separated into different rows
    """
    for col in list_columns:
        df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    
    for col in list_columns:
        df = df.explode(col, ignore_index=True)
    
    return df

And then you just need to iterate over all rows that contain each separate name and write this to a .csv file like this for example

file_name = "people_data.csv"
df = pd.read_csv(file_name)
list_columns = ["NAME", "CARS"]
df_exploded = explode_list_columns(df, list_columns)

for name in df_exploded["NAME"].unique():
    name_df = df_exploded[df_exploded["NAME"] == name]
    output_file_name = f"{name.lower().replace(' ', '_')}_people_data.csv"

    name_df.to_csv(output_file_name, index=False)
# df = pd.read_csv('people_data.csv')
# df
# above df is reading the file created from your function

people = eval(df.iloc[0, 1])
len_people = len(people)
print(len_people)

for i in range(len_people):
    df1 = pd.DataFrame()
    df1[['ID','MONTH', 'PHONE OS']] = df[['ID','MONTH', 'PHONE OS']]
    # print(people[i])
    df1['NAME'] = people[i]
    # print(df['CARS'].apply(lambda x: eval(x)[i]))
    df1['CARS'] = df['CARS'].apply(lambda x: eval(x)[i])
    print(df1)
    # write your csv file here

I ended up using a combination of explode,map, and ast.literal_eval to break out the columns with string lists into different CSV files.

Instead of hard-coding column names like NAME or CARS, the program now dynamically checks which columns contain string representations of lists. This is done by iterating over all columns and using the map_check_if_list_literal function to identify list-like columns and later convert to literals with map_convert_list applied element-wise :

import random
import csv
import pandas as pd
import ast

# Predefined lists for NAME, CARS, and PHONE OS
NAMES = ["John Doe", "Jane Smith", "Alice Johnson", "Bob Brown", "Charlie Davis", "Eve White", "David Wilson", "Emma Taylor", "Frank Harris", "Grace Clark"]
CAR_BRANDS = ["Toyota", "Ford", "BMW", "Tesla", "Honda", "Chevrolet", "Nissan", "Audi"]
PHONE_OS = ["Android", "iOS"]

def create_csv(file_name, num_records):
    cur_random_list_size = random.randint(1, min(len(NAMES), len(CAR_BRANDS)))
    with open(file_name, mode='w', newline='') as file:
        writer = csv.writer(file)
        
        writer.writerow(["ID", "NAME", "MONTH", "CARS", "PHONE OS"])

        for i in range(num_records):

            record = {
                "id" : i + 1,
                "name": [NAMES[n] for n in range(cur_random_list_size)],
                "month": random.randint(1,12),
                "cars": [random.choice(CAR_BRANDS) for _ in range(cur_random_list_size)],
                "phone": random.choice(PHONE_OS)
            }
            writer.writerow(record.values())
    
    print(f"CSV file '{file_name}' created with {num_records} records.")

def map_check_if_list_literal(element):
    if isinstance(element,str):
        try:
            data = ast.literal_eval(element)
            if isinstance(data, list):
                return True
            else:
                return False
        except Exception as e:
            return False
    else:
        return False

def map_convert_list_literal(element):
    if isinstance(element,str):
        try:
            data = ast.literal_eval(element)
            if isinstance(data, list):
                return data
            else:
                return element
        except Exception as e:
            return element
    else:
        return element
    
if __name__ == "__main__":
    create_csv("people_data.csv", 5)

    file_name = "people_data.csv"
    df = pd.read_csv(file_name)
    temp_df = df.map(map_check_if_list_literal)
    columns_w_list = []
    for c in temp_df.columns:
        if temp_df[c].any():
            columns_w_list.append(c)
    
    new_df = df.map(map_convert_list_literal)
    new_df = new_df.explode(columns_w_list)
    
    #this is column of interest
    reference_column = ast.literal_eval(df["NAME"].mode()[0])

    for name in reference_column:
        mask = new_df["NAME"] == name
        unique_df = new_df[mask]
        unique_df.to_csv(f"{name}_{file_name}.csv", index=False)

发布者:admin,转转请注明出处:http://www.yc00.com/questions/1745589634a4634752.html

相关推荐

发表回复

评论列表(0条)

  • 暂无评论

联系我们

400-800-8888

在线咨询: QQ交谈

邮件:admin@example.com

工作时间:周一至周五,9:30-18:30,节假日休息

关注微信