Fine-Tuning GPT-3 Using the OpenAI API and Python
  • AI Chat
  • Code
  • Report
  • Beta
    Spinner
    pip install --upgrade openai -q
    import os
    from openai import OpenAI
    
    # Build the API client used by every request below. The key is read from
    # the OPENAI_API_KEY environment variable; the subscript lookup raises
    # KeyError when that variable is not set.
    client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])

    Dataset

    # Training examples as prompt/completion pairs. Every prompt ends with the
    # "->" marker; every completion starts with a space and ends with a newline.
    # NOTE(review): presumably the separator / stop-sequence convention for
    # completion-style fine-tuning -- confirm against the OpenAI guide.
    training_data = [
        {
            "prompt": "What is the capital of France?->",
            "completion": " The capital of France is Paris.\n",
        },
        {
            "prompt": "What is the primary function of the heart?->",
            "completion": " The primary function of the heart is to pump blood throughout the body.\n",
        },
        {
            "prompt": "What is photosynthesis?->",
            "completion": " Photosynthesis is the process by which green plants and some other organisms convert sunlight into chemical energy stored in the form of glucose.\n",
        },
        {
            "prompt": "Who wrote the play 'Romeo and Juliet'?->",
            "completion": " William Shakespeare wrote the play 'Romeo and Juliet'.\n",
        },
        {
            "prompt": "Which element has the atomic number 1?->",
            "completion": " Hydrogen has the atomic number 1.\n",
        },
        {
            "prompt": "What is the largest planet in our solar system?->",
            "completion": " Jupiter is the largest planet in our solar system.\n",
        },
        {
            "prompt": "What is the freezing point of water in Celsius?->",
            "completion": " The freezing point of water in Celsius is 0 degrees.\n",
        },
        {
            "prompt": "What is the square root of 144?->",
            "completion": " The square root of 144 is 12.\n",
        },
        {
            "prompt": "Who is the author of 'To Kill a Mockingbird'?->",
            "completion": " The author of 'To Kill a Mockingbird' is Harper Lee.\n",
        },
        {
            "prompt": "What is the smallest unit of life?->",
            "completion": " The smallest unit of life is the cell.\n",
        },
    ]
    
    # Held-out examples in the same prompt/completion shape as the training
    # set: prompts end with "->", completions start with a space and end with
    # a newline.
    validation_data = [
        {
            "prompt": "Which gas do plants use for photosynthesis?->",
            "completion": " Plants use carbon dioxide for photosynthesis.\n",
        },
        {
            "prompt": "What are the three primary colors of light?->",
            "completion": " The three primary colors of light are red, green, and blue.\n",
        },
        {
            "prompt": "Who discovered penicillin?->",
            "completion": " Sir Alexander Fleming discovered penicillin.\n",
        },
        {
            "prompt": "What is the chemical formula for water?->",
            "completion": " The chemical formula for water is H2O.\n",
        },
        {
            "prompt": "What is the largest country by land area?->",
            "completion": " Russia is the largest country by land area.\n",
        },
        {
            "prompt": "What is the speed of light in a vacuum?->",
            "completion": " The speed of light in a vacuum is approximately 299,792 kilometers per second.\n",
        },
        {
            "prompt": "What is the currency of Japan?->",
            "completion": " The currency of Japan is the Japanese Yen.\n",
        },
        {
            "prompt": "What is the smallest bone in the human body?->",
            "completion": " The stapes, located in the middle ear, is the smallest bone in the human body.\n",
        },
    ]

    Saving the Dataset

    import json
    
    # Names of the JSONL files for the training and validation examples; the
    # upload step opens these same names.
    training_file_name = "training_data.jsonl"
    validation_file_name = "validation_data.jsonl"
    
    def prepare_data(dictionary_data, final_file_name):
        """Write records to a JSON Lines file, one JSON object per line.

        Args:
            dictionary_data: Iterable of JSON-serializable records (the calls
                in this file pass lists of prompt/completion dicts).
            final_file_name: Path of the file to create. An existing file is
                overwritten because it is opened in 'w' mode.

        Raises:
            TypeError: If a record cannot be serialized by ``json.dump``.
            OSError: If the file cannot be opened for writing.
        """
        # Pin the encoding so the output does not depend on the platform's
        # locale default.
        with open(final_file_name, 'w', encoding='utf-8') as outfile:
            for entry in dictionary_data:
                json.dump(entry, outfile)
                outfile.write('\n')
    
    # Write both datasets using the file-name variables rather than repeating
    # the literals, so the files written here are always the ones the upload
    # step opens.
    prepare_data(training_data, training_file_name)
    prepare_data(validation_data, validation_file_name)

    Uploading the Dataset

    # Upload both JSONL files for fine-tuning. Each file is opened in a `with`
    # block so the handle is closed as soon as the upload call returns.
    # Despite their names, the *_file_id variables hold the objects returned
    # by the API; the ID itself is read from their `.id` attribute.
    with open(training_file_name, "rb") as training_file:
        training_file_id = client.files.create(
            file=training_file,
            purpose="fine-tune",
        )

    with open(validation_file_name, "rb") as validation_file:
        validation_file_id = client.files.create(
            file=validation_file,
            purpose="fine-tune",
        )

    print(f"Training File ID: {training_file_id}")
    print(f"Validation File ID: {validation_file_id}")
    # Bare expression: shown as cell output in a notebook, no effect in a script.
    training_file_id.id

    Fine-Tuning

    # Create the fine-tuning job on the "davinci-002" base model using the
    # uploaded training and validation files, with explicit hyperparameters.
    response = client.fine_tuning.jobs.create(
        model="davinci-002",
        training_file=training_file_id.id,
        validation_file=validation_file_id.id,
        hyperparameters={
            "n_epochs": 15,
            "batch_size": 3,
            "learning_rate_multiplier": 0.3,
        },
    )

    # Keep the initial status and the job ID from the creation response.
    status = response.status
    job_id = response.id

    print(f'Fine-tunning model with jobID: {job_id}.')
    print(f"Training Response: {response}")
    print(f"Training Status: {status}")

    Monitoring the Jobs

    import signal
    import datetime
    
    
    def signal_handler(sig, frame):
        """Report the fine-tuning job's current status when a signal arrives.

        Looks up the job identified by the module-level ``job_id`` and prints
        its status. The handler returns normally instead of raising.

        Args:
            sig: Signal number passed by the ``signal`` machinery (unused).
            frame: Interrupted stack frame (unused).
        """
        job = client.fine_tuning.jobs.retrieve(job_id)
        status = job.status
        print(f"Stream interrupted. Job is still {status}.")
    
    
    print(f"Streaming events for the fine-tuning job: {job_id}")

    # Install the status-reporting SIGINT handler only while events are being
    # listed, and remember the handler that was active before. Leaving the
    # custom handler installed would keep swallowing Ctrl-C for everything
    # that runs afterwards.
    previous_sigint_handler = signal.signal(signal.SIGINT, signal_handler)

    try:
        events = client.fine_tuning.jobs.list_events(fine_tuning_job_id=job_id)
        try:
            # Print each event as "<local timestamp> <message>".
            for event in events:
                print(
                    f'{datetime.datetime.fromtimestamp(event.created_at)} {event.message}'
                )
        except Exception:
            print("Stream interrupted (client disconnected).")
    finally:
        # signal.signal() returns None when the previous handler was not
        # installed from Python; in that case there is nothing to restore.
        if previous_sigint_handler is not None:
            signal.signal(signal.SIGINT, previous_sigint_handler)
    
    import time
    
    # Statuses after which the job no longer changes state. "cancelled" must
    # be included: without it, a cancelled job would keep the loop below
    # polling forever.
    terminal_statuses = ("succeeded", "failed", "cancelled")

    status = client.fine_tuning.jobs.retrieve(job_id).status
    if status not in terminal_statuses:
        print(f"Job not in terminal status: {status}. Waiting.")
        # Poll every 2 seconds until the job reaches a terminal status.
        while status not in terminal_statuses:
            time.sleep(2)
            status = client.fine_tuning.jobs.retrieve(job_id).status
            print(f"Status: {status}")
    else:
        print(f"Finetune job {job_id} finished with status: {status}")
    print("Checking other finetune jobs in the subscription.")
    result = client.fine_tuning.jobs.list()
    # Counts only the jobs present in the returned `data` list.
    print(f"Found {len(result.data)} finetune jobs.")