About Kernel Documentation Linux Kernel Contact Linux Resources Linux Blog

Documentation / filesystems / Locking




Custom Search

Based on kernel version 3.9. Page generated on 2013-05-02 23:06 EST.

1		The text below describes the locking rules for VFS-related methods.
2	It is (believed to be) up-to-date. *Please*, if you change anything in
3	prototypes or locking protocols - update this file. And update the relevant
4	instances in the tree, don't leave that to maintainers of filesystems/devices/
5	etc. At the very least, put the list of dubious cases at the end of this file.
6	Don't turn it into a log - maintainers of out-of-the-tree code are supposed to
7	be able to use diff(1).
8		Thing currently missing here: socket operations. Alexey?
9	
10	--------------------------- dentry_operations --------------------------
11	prototypes:
12		int (*d_revalidate)(struct dentry *, unsigned int);
13		int (*d_weak_revalidate)(struct dentry *, unsigned int);
14		int (*d_hash)(const struct dentry *, const struct inode *,
15				struct qstr *);
16		int (*d_compare)(const struct dentry *, const struct inode *,
17				const struct dentry *, const struct inode *,
18				unsigned int, const char *, const struct qstr *);
19		int (*d_delete)(struct dentry *);
20		void (*d_release)(struct dentry *);
21		void (*d_iput)(struct dentry *, struct inode *);
22		char *(*d_dname)(struct dentry *dentry, char *buffer, int buflen);
23		struct vfsmount *(*d_automount)(struct path *path);
24		int (*d_manage)(struct dentry *, bool);
25	
26	locking rules:
27			rename_lock	->d_lock	may block	rcu-walk
28	d_revalidate:	no		no		yes (ref-walk)	maybe
29	d_weak_revalidate:no		no		yes	 	no
30	d_hash:		no		no		no		maybe
31	d_compare:	yes		no		no		maybe
32	d_delete:	no		yes		no		no
33	d_release:	no		no		yes		no
34	d_prune:        no              yes             no              no
35	d_iput:		no		no		yes		no
36	d_dname:	no		no		no		no
37	d_automount:	no		no		yes		no
38	d_manage:	no		no		yes (ref-walk)	maybe
39	
40	--------------------------- inode_operations --------------------------- 
41	prototypes:
42		int (*create) (struct inode *,struct dentry *,umode_t, bool);
43		struct dentry * (*lookup) (struct inode *,struct dentry *, unsigned int);
44		int (*link) (struct dentry *,struct inode *,struct dentry *);
45		int (*unlink) (struct inode *,struct dentry *);
46		int (*symlink) (struct inode *,struct dentry *,const char *);
47		int (*mkdir) (struct inode *,struct dentry *,umode_t);
48		int (*rmdir) (struct inode *,struct dentry *);
49		int (*mknod) (struct inode *,struct dentry *,umode_t,dev_t);
50		int (*rename) (struct inode *, struct dentry *,
51				struct inode *, struct dentry *);
52		int (*readlink) (struct dentry *, char __user *,int);
53		void * (*follow_link) (struct dentry *, struct nameidata *);
54		void (*put_link) (struct dentry *, struct nameidata *, void *);
55		void (*truncate) (struct inode *);
56		int (*permission) (struct inode *, int);
57		struct posix_acl * (*get_acl)(struct inode *, int);
58		int (*setattr) (struct dentry *, struct iattr *);
59		int (*getattr) (struct vfsmount *, struct dentry *, struct kstat *);
60		int (*setxattr) (struct dentry *, const char *,const void *,size_t,int);
61		ssize_t (*getxattr) (struct dentry *, const char *, void *, size_t);
62		ssize_t (*listxattr) (struct dentry *, char *, size_t);
63		int (*removexattr) (struct dentry *, const char *);
64		int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len);
65		void (*update_time)(struct inode *, struct timespec *, int);
66		int (*atomic_open)(struct inode *, struct dentry *,
67					struct file *, unsigned open_flag,
68					umode_t create_mode, int *opened);
69	
70	locking rules:
71		all may block
72			i_mutex(inode)
73	lookup:		yes
74	create:		yes
75	link:		yes (both)
76	mknod:		yes
77	symlink:	yes
78	mkdir:		yes
79	unlink:		yes (both)
80	rmdir:		yes (both)	(see below)
81	rename:		yes (all)	(see below)
82	readlink:	no
83	follow_link:	no
84	put_link:	no
85	setattr:	yes
86	permission:	no (may not block if called in rcu-walk mode)
87	get_acl:	no
88	getattr:	no
89	setxattr:	yes
90	getxattr:	no
91	listxattr:	no
92	removexattr:	yes
93	fiemap:		no
94	update_time:	no
95	atomic_open:	yes
96	
97		Additionally, ->rmdir(), ->unlink() and ->rename() have ->i_mutex on
98	victim.
99		cross-directory ->rename() has (per-superblock) ->s_vfs_rename_mutex.
100	
101	See Documentation/filesystems/directory-locking for more detailed discussion
102	of the locking scheme for directory operations.
103	
104	--------------------------- super_operations ---------------------------
105	prototypes:
106		struct inode *(*alloc_inode)(struct super_block *sb);
107		void (*destroy_inode)(struct inode *);
108		void (*dirty_inode) (struct inode *, int flags);
109		int (*write_inode) (struct inode *, struct writeback_control *wbc);
110		int (*drop_inode) (struct inode *);
111		void (*evict_inode) (struct inode *);
112		void (*put_super) (struct super_block *);
113		int (*sync_fs)(struct super_block *sb, int wait);
114		int (*freeze_fs) (struct super_block *);
115		int (*unfreeze_fs) (struct super_block *);
116		int (*statfs) (struct dentry *, struct kstatfs *);
117		int (*remount_fs) (struct super_block *, int *, char *);
118		void (*umount_begin) (struct super_block *);
119		int (*show_options)(struct seq_file *, struct dentry *);
120		ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t);
121		ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t);
122		int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t);
123	
124	locking rules:
125		All may block [not true, see below]
126				s_umount
127	alloc_inode:
128	destroy_inode:
129	dirty_inode:
130	write_inode:
131	drop_inode:				!!!inode->i_lock!!!
132	evict_inode:
133	put_super:		write
134	sync_fs:		read
135	freeze_fs:		write
136	unfreeze_fs:		write
137	statfs:			maybe(read)	(see below)
138	remount_fs:		write
139	umount_begin:		no
140	show_options:		no		(namespace_sem)
141	quota_read:		no		(see below)
142	quota_write:		no		(see below)
143	bdev_try_to_free_page:	no		(see below)
144	
145	->statfs() has s_umount (shared) when called by ustat(2) (native or
146	compat), but that's an accident of bad API; s_umount is used to pin
147	the superblock down when we only have dev_t given us by userland to
148	identify the superblock.  Everything else (statfs(), fstatfs(), etc.)
149	doesn't hold it when calling ->statfs() - superblock is pinned down
150	by resolving the pathname passed to syscall.
151	->quota_read() and ->quota_write() functions are both guaranteed to
152	be the only ones operating on the quota file by the quota code (via
153	dqio_sem) (unless an admin really wants to screw up something and
154	writes to quota files with quotas on). For other details about locking
155	see also dquot_operations section.
156	->bdev_try_to_free_page is called from the ->releasepage handler of
157	the block device inode.  See there for more details.
158	
159	--------------------------- file_system_type ---------------------------
160	prototypes:
161		int (*get_sb) (struct file_system_type *, int,
162			       const char *, void *, struct vfsmount *);
163		struct dentry *(*mount) (struct file_system_type *, int,
164			       const char *, void *);
165		void (*kill_sb) (struct super_block *);
166	locking rules:
167			may block
168	mount		yes
169	kill_sb		yes
170	
171	->mount() returns ERR_PTR or the root dentry; its superblock should be locked
172	on return.
173	->kill_sb() takes a write-locked superblock, does all shutdown work on it,
174	unlocks and drops the reference.
175	
176	--------------------------- address_space_operations --------------------------
177	prototypes:
178		int (*writepage)(struct page *page, struct writeback_control *wbc);
179		int (*readpage)(struct file *, struct page *);
180		int (*sync_page)(struct page *);
181		int (*writepages)(struct address_space *, struct writeback_control *);
182		int (*set_page_dirty)(struct page *page);
183		int (*readpages)(struct file *filp, struct address_space *mapping,
184				struct list_head *pages, unsigned nr_pages);
185		int (*write_begin)(struct file *, struct address_space *mapping,
186					loff_t pos, unsigned len, unsigned flags,
187					struct page **pagep, void **fsdata);
188		int (*write_end)(struct file *, struct address_space *mapping,
189					loff_t pos, unsigned len, unsigned copied,
190					struct page *page, void *fsdata);
191		sector_t (*bmap)(struct address_space *, sector_t);
192		int (*invalidatepage) (struct page *, unsigned long);
193		int (*releasepage) (struct page *, int);
194		void (*freepage)(struct page *);
195		int (*direct_IO)(int, struct kiocb *, const struct iovec *iov,
196				loff_t offset, unsigned long nr_segs);
197		int (*get_xip_mem)(struct address_space *, pgoff_t, int, void **,
198					unsigned long *);
199		int (*migratepage)(struct address_space *, struct page *, struct page *);
200		int (*launder_page)(struct page *);
201		int (*is_partially_uptodate)(struct page *, read_descriptor_t *, unsigned long);
202		int (*error_remove_page)(struct address_space *, struct page *);
203		int (*swap_activate)(struct file *);
204		int (*swap_deactivate)(struct file *);
205	
206	locking rules:
207		All except set_page_dirty and freepage may block
208	
209				PageLocked(page)	i_mutex
210	writepage:		yes, unlocks (see below)
211	readpage:		yes, unlocks
212	sync_page:		maybe
213	writepages:
214	set_page_dirty:		no
215	readpages:
216	write_begin:		locks the page		yes
217	write_end:		yes, unlocks		yes
218	bmap:
219	invalidatepage:		yes
220	releasepage:		yes
221	freepage:		yes
222	direct_IO:
223	get_xip_mem:					maybe
224	migratepage:		yes (both)
225	launder_page:		yes
226	is_partially_uptodate:	yes
227	error_remove_page:	yes
228	swap_activate:		no
229	swap_deactivate:	no
230	
231		->write_begin(), ->write_end(), ->sync_page() and ->readpage()
232	may be called from the request handler (/dev/loop).
233	
234		->readpage() unlocks the page, either synchronously or via I/O
235	completion.
236	
237		->readpages() populates the pagecache with the passed pages and starts
238	I/O against them.  They come unlocked upon I/O completion.
239	
240		->writepage() is used for two purposes: for "memory cleansing" and for
241	"sync".  These are quite different operations and the behaviour may differ
242	depending upon the mode.
243	
244	If writepage is called for sync (wbc->sync_mode != WBC_SYNC_NONE) then
245	it *must* start I/O against the page, even if that would involve
246	blocking on in-progress I/O.
247	
248	If writepage is called for memory cleansing (sync_mode ==
249	WBC_SYNC_NONE) then its role is to get as much writeout underway as
250	possible.  So writepage should try to avoid blocking against
251	currently-in-progress I/O.
252	
253	If the filesystem is not called for "sync" and it determines that it
254	would need to block against in-progress I/O to be able to start new I/O
255	against the page the filesystem should redirty the page with
256	redirty_page_for_writepage(), then unlock the page and return zero.
257	This may also be done to avoid internal deadlocks, but rarely.
258	
259	If the filesystem is called for sync then it must wait on any
260	in-progress I/O and then start new I/O.
261	
262	The filesystem should unlock the page synchronously, before returning to the
263	caller, unless ->writepage() returns special WRITEPAGE_ACTIVATE
264	value. WRITEPAGE_ACTIVATE means that page cannot really be written out
265	currently, and VM should stop calling ->writepage() on this page for some
266	time. VM does this by moving page to the head of the active list, hence the
267	name.
268	
269	Unless the filesystem is going to redirty_page_for_writepage(), unlock the page
270	and return zero, writepage *must* run set_page_writeback() against the page,
271	followed by unlocking it.  Once set_page_writeback() has been run against the
272	page, write I/O can be submitted and the write I/O completion handler must run
273	end_page_writeback() once the I/O is complete.  If no I/O is submitted, the
274	filesystem must run end_page_writeback() against the page before returning from
275	writepage.
276	
277	That is: after 2.5.12, pages which are under writeout are *not* locked.  Note,
278	if the filesystem needs the page to be locked during writeout, that is ok, too,
279	the page is allowed to be unlocked at any point in time between the calls to
280	set_page_writeback() and end_page_writeback().
281	
282	Note, failure to run either redirty_page_for_writepage() or the combination of
283	set_page_writeback()/end_page_writeback() on a page submitted to writepage
284	will leave the page itself marked clean but it will be tagged as dirty in the
285	radix tree.  This incoherency can lead to all sorts of hard-to-debug problems
286	in the filesystem like having dirty inodes at umount and losing written data.
287	
288		->sync_page() locking rules are not well-defined - usually it is called
289	with lock on page, but that is not guaranteed. Considering the currently
290	existing instances of this method ->sync_page() itself doesn't look
291	well-defined...
292	
293		->writepages() is used for periodic writeback and for syscall-initiated
294	sync operations.  The address_space should start I/O against at least
295	*nr_to_write pages.  *nr_to_write must be decremented for each page which is
296	written.  The address_space implementation may write more (or less) pages
297	than *nr_to_write asks for, but it should try to be reasonably close.  If
298	nr_to_write is NULL, all dirty pages must be written.
299	
300	writepages should _only_ write pages which are present on
301	mapping->io_pages.
302	
303		->set_page_dirty() is called from various places in the kernel
304	when the target page is marked as needing writeback.  It may be called
305	under spinlock (it cannot block) and is sometimes called with the page
306	not locked.
307	
308		->bmap() is currently used by legacy ioctl() (FIBMAP) provided by some
309	filesystems and by the swapper. The latter will eventually go away.  Please,
310	keep it that way and don't breed new callers.
311	
312		->invalidatepage() is called when the filesystem must attempt to drop
313	some or all of the buffers from the page when it is being truncated.  It
314	returns zero on success.  If ->invalidatepage is zero, the kernel uses
315	block_invalidatepage() instead.
316	
317		->releasepage() is called when the kernel is about to try to drop the
318	buffers from the page in preparation for freeing it.  It returns zero to
319	indicate that the buffers are (or may be) freeable.  If ->releasepage is zero,
320	the kernel assumes that the fs has no private interest in the buffers.
321	
322		->freepage() is called when the kernel is done dropping the page
323	from the page cache.
324	
325		->launder_page() may be called prior to releasing a page if
326	it is still found to be dirty. It returns zero if the page was successfully
327	cleaned, or an error value if not. Note that in order to prevent the page
328	getting mapped back in and redirtied, it needs to be kept locked
329	across the entire operation.
330	
331		->swap_activate will be called with a non-zero argument on
332	files backing (non block device backed) swapfiles. A return value
333	of zero indicates success, in which case this file can be used for
334	backing swapspace. The swapspace operations will be proxied to the
335	address space operations.
336	
337		->swap_deactivate() will be called in the sys_swapoff()
338	path after ->swap_activate() returned success.
339	
340	----------------------- file_lock_operations ------------------------------
341	prototypes:
342		void (*fl_copy_lock)(struct file_lock *, struct file_lock *);
343		void (*fl_release_private)(struct file_lock *);
344	
345	
346	locking rules:
347				file_lock_lock	may block
348	fl_copy_lock:		yes		no
349	fl_release_private:	maybe		no
350	
351	----------------------- lock_manager_operations ---------------------------
352	prototypes:
353		int (*lm_compare_owner)(struct file_lock *, struct file_lock *);
354		void (*lm_notify)(struct file_lock *);  /* unblock callback */
355		int (*lm_grant)(struct file_lock *, struct file_lock *, int);
356		void (*lm_break)(struct file_lock *); /* break_lease callback */
357		int (*lm_change)(struct file_lock **, int);
358	
359	locking rules:
360				file_lock_lock	may block
361	lm_compare_owner:	yes		no
362	lm_notify:		yes		no
363	lm_grant:		no		no
364	lm_break:		yes		no
365	lm_change:		yes		no
366	
367	--------------------------- buffer_head -----------------------------------
368	prototypes:
369		void (*b_end_io)(struct buffer_head *bh, int uptodate);
370	
371	locking rules:
372		called from interrupts. In other words, extreme care is needed here.
373	bh is locked, but that is the only guarantee we have here. Currently only RAID1,
374	highmem, fs/buffer.c, and fs/ntfs/aops.c are providing these. Block devices
375	call this method upon the IO completion.
376	
377	--------------------------- block_device_operations -----------------------
378	prototypes:
379		int (*open) (struct block_device *, fmode_t);
380		int (*release) (struct gendisk *, fmode_t);
381		int (*ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
382		int (*compat_ioctl) (struct block_device *, fmode_t, unsigned, unsigned long);
383		int (*direct_access) (struct block_device *, sector_t, void **, unsigned long *);
384		int (*media_changed) (struct gendisk *);
385		void (*unlock_native_capacity) (struct gendisk *);
386		int (*revalidate_disk) (struct gendisk *);
387		int (*getgeo)(struct block_device *, struct hd_geometry *);
388		void (*swap_slot_free_notify) (struct block_device *, unsigned long);
389	
390	locking rules:
391				bd_mutex
392	open:			yes
393	release:		yes
394	ioctl:			no
395	compat_ioctl:		no
396	direct_access:		no
397	media_changed:		no
398	unlock_native_capacity:	no
399	revalidate_disk:	no
400	getgeo:			no
401	swap_slot_free_notify:	no	(see below)
402	
403	media_changed, unlock_native_capacity and revalidate_disk are called only from
404	check_disk_change().
405	
406	swap_slot_free_notify is called with swap_lock and sometimes the page lock
407	held.
408	
409	
410	--------------------------- file_operations -------------------------------
411	prototypes:
412		loff_t (*llseek) (struct file *, loff_t, int);
413		ssize_t (*read) (struct file *, char __user *, size_t, loff_t *);
414		ssize_t (*write) (struct file *, const char __user *, size_t, loff_t *);
415		ssize_t (*aio_read) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
416		ssize_t (*aio_write) (struct kiocb *, const struct iovec *, unsigned long, loff_t);
417		int (*readdir) (struct file *, void *, filldir_t);
418		unsigned int (*poll) (struct file *, struct poll_table_struct *);
419		long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
420		long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
421		int (*mmap) (struct file *, struct vm_area_struct *);
422		int (*open) (struct inode *, struct file *);
423		int (*flush) (struct file *);
424		int (*release) (struct inode *, struct file *);
425		int (*fsync) (struct file *, loff_t start, loff_t end, int datasync);
426		int (*aio_fsync) (struct kiocb *, int datasync);
427		int (*fasync) (int, struct file *, int);
428		int (*lock) (struct file *, int, struct file_lock *);
429		ssize_t (*readv) (struct file *, const struct iovec *, unsigned long,
430				loff_t *);
431		ssize_t (*writev) (struct file *, const struct iovec *, unsigned long,
432				loff_t *);
433		ssize_t (*sendfile) (struct file *, loff_t *, size_t, read_actor_t,
434				void __user *);
435		ssize_t (*sendpage) (struct file *, struct page *, int, size_t,
436				loff_t *, int);
437		unsigned long (*get_unmapped_area)(struct file *, unsigned long,
438				unsigned long, unsigned long, unsigned long);
439		int (*check_flags)(int);
440		int (*flock) (struct file *, int, struct file_lock *);
441		ssize_t (*splice_write)(struct pipe_inode_info *, struct file *, loff_t *,
442				size_t, unsigned int);
443		ssize_t (*splice_read)(struct file *, loff_t *, struct pipe_inode_info *,
444				size_t, unsigned int);
445		int (*setlease)(struct file *, long, struct file_lock **);
446		long (*fallocate)(struct file *, int, loff_t, loff_t);
447	
448	
449	locking rules:
450		All may block except for ->setlease.
451		No VFS locks held on entry except for ->setlease.
452	
453	->setlease has the file_lock_lock held and must not sleep.
454	
455	->llseek() locking has moved from llseek to the individual llseek
456	implementations.  If your fs is not using generic_file_llseek, you
457	need to acquire and release the appropriate locks in your ->llseek().
458	For many filesystems, it is probably safe to acquire the inode
459	mutex or just to use i_size_read() instead.
460	Note: this does not protect the file->f_pos against concurrent modifications
461	since this is something userspace has to take care of.
462	
463	->fasync() is responsible for maintaining the FASYNC bit in filp->f_flags.
464	Most instances call fasync_helper(), which does that maintenance, so it's
465	not normally something one needs to worry about.  Return values > 0 will be
466	mapped to zero in the VFS layer.
467	
468	->readdir() and ->ioctl() on directories must be changed. Ideally we would
469	move ->readdir() to inode_operations and use a separate method for directory
470	->ioctl() or kill the latter completely. One of the problems is that for
471	anything that resembles union-mount we won't have a struct file for all
472	components. And there are other reasons why the current interface is a mess...
473	
474	->read on directories probably must go away - we should just enforce -EISDIR
475	in sys_read() and friends.
476	
477	--------------------------- dquot_operations -------------------------------
478	prototypes:
479		int (*write_dquot) (struct dquot *);
480		int (*acquire_dquot) (struct dquot *);
481		int (*release_dquot) (struct dquot *);
482		int (*mark_dirty) (struct dquot *);
483		int (*write_info) (struct super_block *, int);
484	
485	These operations are intended to be more or less wrapping functions that ensure
486	a proper locking wrt the filesystem and call the generic quota operations.
487	
488	What filesystem should expect from the generic quota functions:
489	
490			FS recursion	Held locks when called
491	write_dquot:	yes		dqonoff_sem or dqptr_sem
492	acquire_dquot:	yes		dqonoff_sem or dqptr_sem
493	release_dquot:	yes		dqonoff_sem or dqptr_sem
494	mark_dirty:	no		-
495	write_info:	yes		dqonoff_sem
496	
497	FS recursion means calling ->quota_read() and ->quota_write() from superblock
498	operations.
499	
500	More details about quota locking can be found in fs/quota/dquot.c.
501	
502	--------------------------- vm_operations_struct -----------------------------
503	prototypes:
504		void (*open)(struct vm_area_struct*);
505		void (*close)(struct vm_area_struct*);
506		int (*fault)(struct vm_area_struct*, struct vm_fault *);
507		int (*page_mkwrite)(struct vm_area_struct *, struct vm_fault *);
508		int (*access)(struct vm_area_struct *, unsigned long, void*, int, int);
509	
510	locking rules:
511			mmap_sem	PageLocked(page)
512	open:		yes
513	close:		yes
514	fault:		yes		can return with page locked
515	page_mkwrite:	yes		can return with page locked
516	access:		yes
517	
518		->fault() is called when a previously not present pte is about
519	to be faulted in. The filesystem must find and return the page associated
520	with the passed in "pgoff" in the vm_fault structure. If it is possible that
521	the page may be truncated and/or invalidated, then the filesystem must lock
522	the page, then ensure it is not already truncated (the page lock will block
523	subsequent truncate), and then return with VM_FAULT_LOCKED, and the page
524	locked. The VM will unlock the page.
525	
526		->page_mkwrite() is called when a previously read-only pte is
527	about to become writeable. The filesystem again must ensure that there are
528	no truncate/invalidate races, and then return with the page locked. If
529	the page has been truncated, the filesystem should not look up a new page
530	like the ->fault() handler, but simply return with VM_FAULT_NOPAGE, which
531	will cause the VM to retry the fault.
532	
533		->access() is called when get_user_pages() fails in
534	access_process_vm(), typically used to debug a process through
535	/proc/pid/mem or ptrace.  This function is needed only for
536	VM_IO | VM_PFNMAP VMAs.
537	
538	================================================================================
539				Dubious stuff
540	
541	(if you break something or notice that it is broken and do not fix it yourself
542	- at least put it here)
Hide Line Numbers
About Kernel Documentation Linux Kernel Contact Linux Resources Linux Blog

Information is copyright its respective author. All material is available from the Linux Kernel Source distributed under a GPL License. This page is provided as a free service by mjmwired.net.