How to stop Pytest from appending to CSV created after initial test

Question

I am testing a class of functions that apply specific transformations to columns of a CSV file retrieved from an S3 bucket. The test functions should retrieve the 'test_data.csv' file from the S3 bucket (which is created using the levels_etl and levels_etl_with_test_csv_data fixtures) and create a new CSV with the transformations applied.

The problem I am having is that each of the test functions passes when run individually, but when they are run as part of a class, only the first test passes and all the other tests fail. For some reason, instead of creating a new CSV with the transformations applied, each test's CSV output is appended to the CSV created in the previous test, which causes the assertions to fail; every successive test appends to the same CSV.

Setup Code:

@pytest.fixture
def levels_etl():
    """Yield a ``Levels_ETL`` instance backed by a mocked S3 bucket.

    Setup starts ``mock_s3``, exports dummy AWS credentials as environment
    variables, creates the bucket ``test-bucket`` and wraps it in an
    ``S3BucketConnector``. Teardown stops the mock.

    The mock is stopped in a ``finally`` clause so that it is released even
    when setup itself raises (for example while creating the bucket or the
    connector); otherwise the mock would stay active for every later test.

    Yields:
        The ``Levels_ETL`` instance under test.
    """
    # Mocking S3 connection start
    mock_bucket=mock_s3()
    mock_bucket.start()
    try:
        # Defining Class Arguments
        # NOTE(review): the connector receives the *names* of the environment
        # variables, not the secrets -- presumably it reads os.environ itself.
        s3_access_key='AWS_ACCESS_KEY_ID'
        s3_secret_key='AWS_SECRET_ACCESS_KEY'
        s3_endpoint_url='https://s3.us-east-2.amazonaws.com'
        s3_bucket_name='test-bucket'
        # Creating s3 access keys as environment variables
        os.environ[s3_access_key]='KEY1'
        os.environ[s3_secret_key]='KEY2'
        s3=boto3.resource(service_name='s3',endpoint_url=s3_endpoint_url)
        s3.create_bucket(Bucket=s3_bucket_name, CreateBucketConfiguration={'LocationConstraint':'us-east-2'})
        # Creating Test instance
        s3_bucket_conn=S3BucketConnector(s3_access_key,s3_secret_key,s3_endpoint_url,s3_bucket_name)
        etl=Levels_ETL(s3_bucket_conn)
        yield etl
    finally:
        # Teardown: always release the mock, including on setup failure
        mock_bucket.stop()

@pytest.fixture
def levels_etl_with_test_csv_data(tmpdir_factory,levels_etl):
    """Upload a fixed sample CSV as 'test_data.csv' and yield the ETL object.

    The sample is written to a fresh temporary directory, uploaded to the
    bucket held by ``levels_etl`` and, once the test has finished, the
    'test_data.csv' object is deleted from the bucket again.

    Yields:
        The same ``levels_etl`` object that was passed in.
    """
    object_key='test_data.csv'
    header=['date','company','location','title','level','specialisation','gender',
            'years_of_experience','years_at_company','base_salary','stock','bonus']
    # Sample rows: they include an exact duplicate (Facebook, 10/11/2017),
    # a row with all-zero compensation (Microsoft) and rows with blank fields.
    rows=[
        ['1/1/2017 11:33:27','Google','Sunnyvale, CA','Software Engineer','L3','android',
         'male','1','0','120000','40000','15000'],
        ['4/20/2017 11:33:27','Apple','Austin, TX','Software Engineer','ICT2','iOS Development',
         'female','1','0','90','30','20'],
        ['4/20/2017 11:33:27','Microsoft','Bellevue, WA','Product Manager','59','UX/UI',
         'Male','0','0','0','0','0'],
        ['7/15/2017 11:33:27','Hubspot','Cambridge, MA, United States','Software Engineer','Junior',
         'Site Reliability (SRE)','','','','135','5','0'],
        ['10/11/2017 11:33:27','Facebook','Menlo Park, CA','Software Engineer','E5','production',
         'male','11','2','215','100','40'],
        ['10/11/2017 11:33:27','Facebook','Menlo Park, CA','Software Engineer','E5','production',
         'male','11','2','215','100','40'],
        ['12/11/2017 11:33:27','spotify','New York, NY','Software Engineer','Engineer 1',
         'fullstack developer','male','4','0','180','37.5','0'],
        ['1/30/2018 11:33:27','Intel','Santa Clara, CA','Software Engineer','grade 9',
         'augmented reality','male','20','5','204','50','20'],
        ['1/30/2018 11:33:27','Intel','Santa Clara, CA','Software Engineer','grade 9',
         'virtual reality','male','20','5','204','50','20'],
        ['3/30/2018 11:33:27','Netflix','Denver, CO','Software Engineer','E5',
         'Web Development (front-end)','male','20','2','591','0','0'],
        ['4/7/2018 11:33:27','Sony Interactive Entertainment','San Francisco, CA','Software Engineer','L4',
         'backend tools','male','6','6','103','5','32'],
        ['5/9/2018 11:33:27','Lyft','New York, NY','Data Scientist','t6','algorithms',
         'male','6','3','200','200','0'],
        ['11/11/2018 11:33:27','Hudson River Trading','New York, NY','Software Engineer','L4',
         'algorithm','male','6','4','431','0','1700'],
        ['4/7/2019 11:33:27','Facebook','Chicago, IL','Product Designer','IC4',
         'user experience','female','7','0','143','40','22.7'],
        ['4/7/2019 11:33:27','Facebook','New York, NY','Product Designer','IC4',
         'ux','female','7','2','173','40','0'],
        ['4/7/2019 11:33:27','Mango Voice','Salt Lake City, UT','Product Designer','l3',
         'ui','female','5','3','74.5','0','0'],
        ['9/13/2020 11:33:27','No Salary Startup','Chicago, IL','Product Designer','',
         'user interface','female','0','0','0','100','0'],
        ['4/7/2021 11:33:27','','Chicago, IL','','IC4','user experience','female',
         '7','0','143','40','22.7'],
        ['4/7/2021 11:33:27','twitter','Washington, DC','software engineer','swe II',
         'data','male','2','2','150','60','0'],
    ]

    # Write header and data rows in one pass to a file in a new temp directory.
    csv_path=str(tmpdir_factory.mktemp('data').join(object_key))
    with open(csv_path,'w',encoding='UTF-8',newline='') as handle:
        csv.writer(handle).writerows([header,*rows])

    levels_etl.s3_bucket._bucket.upload_file(Filename=csv_path,Key=object_key)
    yield levels_etl
    # Teardown: remove only the uploaded input object.
    levels_etl.s3_bucket._bucket.delete_objects(Delete={'Objects':[{'Key':object_key}]})

Test Class Functions (2 of many)

def test_transform_job_data(self,levels_etl_with_test_csv_data):
        """Check the 'job_data.csv' object produced by ``transform_job_data``.

        Runs the transformation on 'test_data.csv', reads 'job_data.csv' back
        from the bucket and verifies the float columns, the absence of
        duplicate rows and of rows with zero base salary and zero stock, and
        the compensation values for the Google and Apple rows.
        """
        key_exp='test_data.csv'
        levels_etl_with_test_csv_data.transform_job_data(key=key_exp)
        jobdata_csv=levels_etl_with_test_csv_data.s3_bucket._bucket.Object(key='job_data.csv').get().get('Body').read().decode('UTF-8')
        print('jobdata_csv',jobdata_csv)
        job_data_df=pd.read_csv(StringIO(jobdata_csv))
        assert list(job_data_df.select_dtypes(include=['float']).columns)==['years_of_experience','years_at_company',
        'base_salary','stock','bonus']
        assert not job_data_df.duplicated().any()
        assert not ((job_data_df['base_salary']==0) & (job_data_df['stock']==0)).any()
        # NOTE(review): pd.read_csv parses empty fields as NaN by default, so
        # comparing against '' probably never matches and this check may be
        # vacuous -- confirm whether .isna() was intended.
        assert not ((job_data_df['company']=='') & (job_data_df['title']=='')).any()
        assert job_data_df[job_data_df['company']=='Google']['base_salary'].values[0]==120000.00
        assert job_data_df[job_data_df['company']=='Google']['stock'].values[0]==40000.00
        assert job_data_df[job_data_df['company']=='Google']['bonus'].values[0]==15000.00
        assert job_data_df[job_data_df['company']=='Apple']['base_salary'].values[0]==90000.00
        assert job_data_df[job_data_df['company']=='Apple']['stock'].values[0]==30000.00
        # The Apple input row has bonus '20', which the transformation reports
        # as 20000.0 in the expected output; the previous 10000.00 was wrong.
        assert job_data_df[job_data_df['company']=='Apple']['bonus'].values[0]==20000.00
    
    def test_transform_dates(self,levels_etl_with_test_csv_data):
        """Check the 'date.csv' object produced by ``transform_dates``.

        Runs the transformation on 'test_data.csv', reads 'date.csv' back
        from the bucket and verifies its columns and the date, year, month
        and quarter values for all 19 input rows.
        """
        key_exp='test_data.csv'
        levels_etl_with_test_csv_data.transform_dates(key=key_exp)
        date_csv=levels_etl_with_test_csv_data.s3_bucket._bucket.Object(key='date.csv').get().get('Body').read().decode('UTF-8')
        print('date_csv',date_csv)
        date_df=pd.read_csv(StringIO(date_csv))
        assert list(date_df.columns)==['date','year','month','quarter']
        assert date_df['date'].tolist()==['2017-01-01','2017-04-20','2017-04-20','2017-07-15',
        '2017-10-11','2017-10-11','2017-12-11','2018-01-30','2018-01-30','2018-03-30','2018-04-07','2018-05-09',
        '2018-11-11','2019-04-07','2019-04-07','2019-04-07','2020-09-13','2021-04-07','2021-04-07']
        assert date_df['year'].tolist()==[2017,2017,2017,2017,2017,2017,2017,2018,2018,2018,2018,2018,2018,
        2019,2019,2019,2020,2021,2021]
        # This comparison previously lacked `assert`, so the month column was
        # never actually checked.
        assert date_df['month'].tolist()==[1,4,4,7,10,10,12,1,1,3,4,5,11,4,4,4,9,4,4]
        assert date_df['quarter'].tolist()==[1,2,2,3,4,4,4,1,1,1,2,2,4,2,2,2,3,2,2]

The transform_job_data and transform_dates functions both retrieve the 'test_data.csv' file from the S3 bucket, apply pandas dataframe transformations and then convert back to CSV and upload new CSV to S3.

With the first test I get the expected CSV output:

jobdata_csv date,company,location,title,level,specialisation,gender,years_of_experience,years_at_company,base_salary,stock,bonus 1/1/2017 11:33:27,Google,"Sunnyvale, CA",Software Engineer,L3,android,male,1.0,0.0,120000.0,40000.0,15000.0 4/20/2017 11:33:27,Apple,"Austin, TX",Software Engineer,ICT2,iOS Development,female,1.0,0.0,90000.0,30000.0,20000.0 7/15/2017 11:33:27,Hubspot,"Cambridge, MA, United States",Software Engineer,Junior,Site Reliability (SRE),,,,135000.0,5000.0,0.0 10/11/2017 11:33:27,Facebook,"Menlo Park, CA",Software Engineer,E5,production,male,11.0,2.0,215000.0,100000.0,40000.0 12/11/2017 11:33:27,spotify,"New York, NY",Software Engineer,Engineer 1,fullstack developer,male,4.0,0.0,180000.0,37500.0,0.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,augmented reality,male,20.0,5.0,204000.0,50000.0,20000.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,virtual reality,male,20.0,5.0,204000.0,50000.0,20000.0 3/30/2018 11:33:27,Netflix,"Denver, CO",Software Engineer,E5,Web Development (front-end),male,20.0,2.0,591000.0,0.0,0.0 4/7/2018 11:33:27,Sony Interactive Entertainment,"San Francisco, CA",Software Engineer,L4,backend tools,male,6.0,6.0,103000.0,5000.0,32000.0 5/9/2018 11:33:27,Lyft,"New York, NY",Data Scientist,t6,algorithms,male,6.0,3.0,200000.0,200000.0,0.0 11/11/2018 11:33:27,Hudson River Trading,"New York, NY",Software Engineer,L4,algorithm,male,6.0,4.0,431000.0,0.0,1700000.0 4/7/2019 11:33:27,Facebook,"Chicago, IL",Product Designer,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 4/7/2019 11:33:27,Facebook,"New York, NY",Product Designer,IC4,ux,female,7.0,2.0,173000.0,40000.0,0.0 4/7/2019 11:33:27,Mango Voice,"Salt Lake City, UT",Product Designer,l3,ui,female,5.0,3.0,74500.0,0.0,0.0 9/13/2020 11:33:27,No Salary Startup,"Chicago, IL",Product Designer,,user interface,female,0.0,0.0,0.0,100000.0,0.0 4/7/2021 11:33:27,,"Chicago, IL",,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 
4/7/2021 11:33:27,twitter,"Washington, DC",software engineer,swe II,data,male,2.0,2.0,150000.0,60000.0,0.0

But for the second one, it appends to the CSV from the prior test instead of creating CSV with date, year, month and quarter columns:

date_csv date,company,location,title,level,specialisation,gender,years_of_experience,years_at_company,base_salary,stock,bonus 1/1/2017 11:33:27,Google,"Sunnyvale, CA",Software Engineer,L3,android,male,1.0,0.0,120000.0,40000.0,15000.0 4/20/2017 11:33:27,Apple,"Austin, TX",Software Engineer,ICT2,iOS Development,female,1.0,0.0,90000.0,30000.0,20000.0 7/15/2017 11:33:27,Hubspot,"Cambridge, MA, United States",Software Engineer,Junior,Site Reliability (SRE),,,,135000.0,5000.0,0.0 10/11/2017 11:33:27,Facebook,"Menlo Park, CA",Software Engineer,E5,production,male,11.0,2.0,215000.0,100000.0,40000.0 12/11/2017 11:33:27,spotify,"New York, NY",Software Engineer,Engineer 1,fullstack developer,male,4.0,0.0,180000.0,37500.0,0.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,augmented reality,male,20.0,5.0,204000.0,50000.0,20000.0 1/30/2018 11:33:27,Intel,"Santa Clara, CA",Software Engineer,grade 9,virtual reality,male,20.0,5.0,204000.0,50000.0,20000.0 3/30/2018 11:33:27,Netflix,"Denver, CO",Software Engineer,E5,Web Development (front-end),male,20.0,2.0,591000.0,0.0,0.0 4/7/2018 11:33:27,Sony Interactive Entertainment,"San Francisco, CA",Software Engineer,L4,backend tools,male,6.0,6.0,103000.0,5000.0,32000.0 5/9/2018 11:33:27,Lyft,"New York, NY",Data Scientist,t6,algorithms,male,6.0,3.0,200000.0,200000.0,0.0 11/11/2018 11:33:27,Hudson River Trading,"New York, NY",Software Engineer,L4,algorithm,male,6.0,4.0,431000.0,0.0,1700000.0 4/7/2019 11:33:27,Facebook,"Chicago, IL",Product Designer,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 4/7/2019 11:33:27,Facebook,"New York, NY",Product Designer,IC4,ux,female,7.0,2.0,173000.0,40000.0,0.0 4/7/2019 11:33:27,Mango Voice,"Salt Lake City, UT",Product Designer,l3,ui,female,5.0,3.0,74500.0,0.0,0.0 9/13/2020 11:33:27,No Salary Startup,"Chicago, IL",Product Designer,,user interface,female,0.0,0.0,0.0,100000.0,0.0 4/7/2021 11:33:27,,"Chicago, IL",,IC4,user experience,female,7.0,0.0,143000.0,40000.0,22700.0 
4/7/2021 11:33:27,twitter,"Washington, DC",software engineer,swe II,data,male,2.0,2.0,150000.0,60000.0,0.0 date,year,month,quarter 2017-01-01,2017,1,1 2017-04-20,2017,4,2 2017-04-20,2017,4,2 2017-07-15,2017,7,3 2017-10-11,2017,10,4 2017-10-11,2017,10,4 2017-12-11,2017,12,4 2018-01-30,2018,1,1 2018-01-30,2018,1,1 2018-03-30,2018,3,1 2018-04-07,2018,4,2 2018-05-09,2018,5,2 2018-11-11,2018,11,4 2019-04-07,2019,4,2 2019-04-07,2019,4,2 2019-04-07,2019,4,2 2020-09-13,2020,9,3 2021-04-07,2021,4,2 2021-04-07,2021,4,2

I have tried modifying the scopes of the pytest fixtures between class, session and function but I am not getting the desired result. I added teardown code that deletes the 'test_data.csv' object after each test in the levels_etl_with_test_csv_data fixture but that has had no impact either.

Where is my issue coming from?

score 0 · Answer 1 · Apr 28, 2023

It sounds like the issue you're facing is that the CSV file created by the test is not being cleared or overwritten between test runs. One way to solve this issue is to add code to your test to ensure that the CSV file is cleared or overwritten before each test run.

One approach is to create a new temporary file for each test. You can do this by using the tmpdir fixture provided by pytest. The tmpdir fixture creates a temporary directory that is unique to each test function invocation. You can use this directory to create a new temporary file for your CSV output, so that no test writes into a file left behind by a previous one.